Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 479 results for author: <span class="mathjax">Lee, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lee%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lee, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lee%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lee, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lee%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lee%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lee%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lee%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lee%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lee%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18072">arXiv:2411.18072</a> <span> [<a href="https://arxiv.org/pdf/2411.18072">pdf</a>, <a href="https://arxiv.org/format/2411.18072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SmileSplat: Generalizable Gaussian Splats for Unconstrained Sparse Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanyan Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yixin Fang</a>, <a href="/search/cs?searchtype=author&query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18072v1-abstract-short" style="display: inline;"> Sparse Multi-view Images can be Learned to predict explicit radiance fields via Generalizable Gaussian Splatting approaches, which can achieve wider application prospects in real-life when ground-truth camera parameters are not required as inputs. 
In this paper, a novel generalizable Gaussian Splatting method, SmileSplat, is proposed to reconstruct pixel-aligned Gaussian surfels for diverse scenar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18072v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18072v1-abstract-full" style="display: none;"> Sparse Multi-view Images can be Learned to predict explicit radiance fields via Generalizable Gaussian Splatting approaches, which can achieve wider application prospects in real-life when ground-truth camera parameters are not required as inputs. In this paper, a novel generalizable Gaussian Splatting method, SmileSplat, is proposed to reconstruct pixel-aligned Gaussian surfels for diverse scenarios only requiring unconstrained sparse multi-view images. First, Gaussian surfels are predicted based on the multi-head Gaussian regression decoder, which can are represented with less degree-of-freedom but have better multi-view consistency. Furthermore, the normal vectors of Gaussian surfel are enhanced based on high-quality of normal priors. Second, the Gaussians and camera parameters (both extrinsic and intrinsic) are optimized to obtain high-quality Gaussian radiance fields for novel view synthesis tasks based on the proposed Bundle-Adjusting Gaussian Splatting module. Extensive experiments on novel view rendering and depth map prediction tasks are conducted on public datasets, demonstrating that the proposed method achieves state-of-the-art performance in various 3D vision tasks. More information can be found on our project page (https://yanyan-li.github.io/project/gs/smilesplat) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18072v1-abstract-full').style.display = 'none'; document.getElementById('2411.18072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17030">arXiv:2411.17030</a> <span> [<a href="https://arxiv.org/pdf/2411.17030">pdf</a>, <a href="https://arxiv.org/format/2411.17030">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> g3D-LF: Generalizable 3D-Language Feature Fields for Embodied Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17030v1-abstract-short" style="display: inline;"> We introduce Generalizable 3D-Language Feature Fields (g3D-LF), a 3D representation model pre-trained on large-scale 3D-language dataset for embodied tasks. Our g3D-LF processes posed RGB-D images from agents to encode feature fields for: 1) Novel view representation predictions from any position in the 3D scene; 2) Generations of BEV maps centered on the agent; 3) Querying targets using multi-gra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17030v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17030v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17030v1-abstract-full" style="display: none;"> We introduce Generalizable 3D-Language Feature Fields (g3D-LF), a 3D representation model pre-trained on large-scale 3D-language dataset for embodied tasks. Our g3D-LF processes posed RGB-D images from agents to encode feature fields for: 1) Novel view representation predictions from any position in the 3D scene; 2) Generations of BEV maps centered on the agent; 3) Querying targets using multi-granularity language within the above-mentioned representations. Our representation can be generalized to unseen environments, enabling real-time construction and dynamic updates. By volume rendering latent features along sampled rays and integrating semantic and spatial relationships through multiscale encoders, our g3D-LF produces representations at different scales and perspectives, aligned with multi-granularity language, via multi-level contrastive learning. Furthermore, we prepare a large-scale 3D-language dataset to align the representations of the feature fields with language. Extensive experiments on Vision-and-Language Navigation under both Panorama and Monocular settings, Zero-shot Object Navigation, and Situated Question Answering tasks highlight the significant advantages and effectiveness of our g3D-LF for embodied tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17030v1-abstract-full').style.display = 'none'; document.getElementById('2411.17030v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15779">arXiv:2411.15779</a> <span> [<a href="https://arxiv.org/pdf/2411.15779">pdf</a>, <a href="https://arxiv.org/format/2411.15779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZeroGS: Training 3D Gaussian Splatting from Unposed Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Potamias%2C+R+A">Rolandos Alexandros Potamias</a>, <a href="/search/cs?searchtype=author&query=Ververas%2C+E">Evangelos Ververas</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jifei Song</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiankang Deng</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15779v1-abstract-short" style="display: inline;"> Neural radiance fields (NeRF) and 3D Gaussian Splatting (3DGS) are popular techniques to reconstruct and render photo-realistic images. However, the pre-requisite of running Structure-from-Motion (SfM) to get camera poses limits their completeness. While previous methods can reconstruct from a few unposed images, they are not applicable when images are unordered or densely captured. In this work,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15779v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15779v1-abstract-full" style="display: none;"> Neural radiance fields (NeRF) and 3D Gaussian Splatting (3DGS) are popular techniques to reconstruct and render photo-realistic images. However, the pre-requisite of running Structure-from-Motion (SfM) to get camera poses limits their completeness. While previous methods can reconstruct from a few unposed images, they are not applicable when images are unordered or densely captured. In this work, we propose ZeroGS to train 3DGS from hundreds of unposed and unordered images. Our method leverages a pretrained foundation model as the neural scene representation. Since the accuracy of the predicted pointmaps does not suffice for accurate image registration and high-fidelity image rendering, we propose to mitigate the issue by initializing and finetuning the pretrained model from a seed image. Images are then progressively registered and added to the training buffer, which is further used to train the model. 

4. arXiv:2411.12460 [pdf, other] cs.CL cs.AI
   Guide-to-Explain for Controllable Summarization
   Authors: Sangwon Ryu, Heejin Do, Daehee Kim, Yunsu Kim, Gary Geunbae Lee, Jungseul Ok
   Abstract: Recently, large language models (LLMs) have demonstrated remarkable performance in abstractive summarization tasks. However, controllable summarization with LLMs remains underexplored, limiting their ability to generate summaries that align with specific user preferences. In this paper, we first investigate the capability of LLMs to control diverse attributes, revealing that they encounter greater challenges with numerical attributes, such as length and extractiveness, than with linguistic attributes. To address this challenge, we propose a guide-to-explain framework (GTE) for controllable summarization. Our GTE framework enables the model to identify misaligned attributes in the initial draft and guides it in explaining errors in the previous output. Based on this reflection, the model generates a well-adjusted summary. As a result, by allowing the model to reflect on its misalignment, we generate summaries that satisfy the desired attributes in surprisingly few iterations compared to other iterative methods that solely use LLMs.
   Submitted 19 November, 2024; originally announced November 2024.

5. arXiv:2411.11903 [pdf, other] cs.CV
   DiHuR: Diffusion-Guided Generalizable Human Reconstruction
   Authors: Jinnan Chen, Chen Li, Gim Hee Lee
   Abstract: We introduce DiHuR, a novel Diffusion-guided model for generalizable Human 3D Reconstruction and view synthesis from sparse, minimally overlapping images. While existing generalizable human radiance fields excel at novel view synthesis, they often struggle with comprehensive 3D reconstruction. Similarly, directly optimizing implicit Signed Distance Function (SDF) fields from sparse-view images typically yields poor results due to limited overlap. To enhance 3D reconstruction quality, we propose using learnable tokens associated with SMPL vertices to aggregate sparse view features and then to guide SDF prediction. These tokens learn a generalizable prior across different identities in training datasets, leveraging the consistent projection of SMPL vertices onto similar semantic areas across various human identities. This consistency enables effective knowledge transfer to unseen identities during inference. Recognizing SMPL's limitations in capturing clothing details, we incorporate a diffusion model as an additional prior to fill in missing information, particularly for complex clothing geometries. Our method integrates two key priors in a coherent manner: the prior from generalizable feed-forward models and the 2D diffusion prior, and it requires only multi-view image training, without 3D supervision. DiHuR demonstrates superior performance in both within-dataset and cross-dataset generalization settings, as validated on the THuman, ZJU-MoCap, and HuMMan datasets compared to existing methods.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: Accepted to WACV 2025

6. arXiv:2411.02551 [pdf, other] cs.SD cs.AI cs.MM eess.AS
   PIAST: A Multimodal Piano Dataset with Audio, Symbolic and Text
   Authors: Hayeon Bang, Eunjin Choi, Megan Finch, Seungheon Doh, Seolhee Lee, Gyeong-Hoon Lee, Juhan Nam
   Abstract: While piano music has become a significant area of study in Music Information Retrieval (MIR), there is a notable lack of datasets for piano solo music with text labels. To address this gap, we present PIAST (PIano dataset with Audio, Symbolic, and Text), a piano music dataset. Utilizing a piano-specific taxonomy of semantic tags, we collected 9,673 tracks from YouTube and added human annotations for 2,023 tracks by music experts, resulting in two subsets: PIAST-YT and PIAST-AT. Both include audio, text, tag annotations, and transcribed MIDI utilizing state-of-the-art piano transcription and beat tracking models. Among many possible tasks with the multi-modal dataset, we conduct music tagging and retrieval using both audio and MIDI data and report baseline performances to demonstrate its potential as a valuable resource for MIR research.
   Submitted 7 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
   Comments: Accepted for publication at the 3rd Workshop on NLP for Music and Audio (NLP4MusA 2024)
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the 3rd Workshop on NLP for Music and Audio (NLP4MusA 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02319">arXiv:2411.02319</a> <span> [<a href="https://arxiv.org/pdf/2411.02319">pdf</a>, <a href="https://arxiv.org/format/2411.02319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GenXD: Generating Any 3D and 4D Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chung-Ching Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiwen Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02319v2-abstract-short" style="display: inline;"> Recent developments in 2D visual generation have been remarkably successful. However, 3D and 4D generation remain challenging in real-world applications due to the lack of large-scale 4D data and effective model design. In this paper, we propose to jointly investigate general 3D and 4D generation by leveraging camera and object movements commonly observed in daily life. Due to the lack of real-wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02319v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02319v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02319v2-abstract-full" style="display: none;"> Recent developments in 2D visual generation have been remarkably successful. However, 3D and 4D generation remain challenging in real-world applications due to the lack of large-scale 4D data and effective model design. In this paper, we propose to jointly investigate general 3D and 4D generation by leveraging camera and object movements commonly observed in daily life. Due to the lack of real-world 4D data in the community, we first propose a data curation pipeline to obtain camera poses and object motion strength from videos. Based on this pipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K. By leveraging all the 3D and 4D data, we develop our framework, GenXD, which allows us to produce any 3D or 4D scene. We propose multiview-temporal modules, which disentangle camera and object movements, to seamlessly learn from both 3D and 4D data. 
Additionally, GenXD employs masked latent conditions to support a variety of conditioning views. GenXD can generate videos that follow the camera trajectory as well as consistent 3D views that can be lifted into 3D representations. We perform extensive evaluations across various real-world and synthetic datasets, demonstrating GenXD's effectiveness and versatility compared to previous methods in 3D and 4D generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02319v2-abstract-full').style.display = 'none'; document.getElementById('2411.02319v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00686">arXiv:2411.00686</a> <span> [<a href="https://arxiv.org/pdf/2411.00686">pdf</a>, <a href="https://arxiv.org/format/2411.00686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Latent Paraphrasing: Perturbation on Layers Improves Knowledge Injection in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minki Kang</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+S+J">Sung Ju Hwang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gibbeum Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Jaewoong Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00686v1-abstract-short" style="display: inline;"> As Large Language Models (LLMs) are increasingly deployed in specialized domains with continuously evolving knowledge, the need for timely and precise knowledge injection has become essential. Fine-tuning with paraphrased data is a common approach to enhance knowledge injection, yet it faces two significant challenges: high computational costs due to repetitive external model usage and limited sam… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00686v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00686v1-abstract-full" style="display: none;"> As Large Language Models (LLMs) are increasingly deployed in specialized domains with continuously evolving knowledge, the need for timely and precise knowledge injection has become essential. Fine-tuning with paraphrased data is a common approach to enhance knowledge injection, yet it faces two significant challenges: high computational costs due to repetitive external model usage and limited sample diversity. To this end, we introduce LaPael, a latent-level paraphrasing method that applies input-dependent noise to early LLM layers. 

9. arXiv:2411.00029 [pdf, other] cs.CL cs.AI cs.CV cs.LG
   Preserving Pre-trained Representation Space: On Effectiveness of Prefix-tuning for Large Multi-modal Models
   Authors: Donghoon Kim, Gusang Lee, Kyuhong Shim, Byonghyo Shim
   Abstract: Recently, we have observed that Large Multi-modal Models (LMMs) are revolutionizing the way machines interact with the world, unlocking new possibilities across various multi-modal applications. To adapt LMMs for downstream tasks, parameter-efficient fine-tuning (PEFT), which only trains additional prefix tokens or modules, has gained popularity. Nevertheless, there has been little analysis of how PEFT works in LMMs. In this paper, we delve into the strengths and weaknesses of each tuning strategy, shifting the focus from the efficiency typically associated with these approaches. We first discover that model parameter tuning methods such as LoRA and Adapters distort the feature representation space learned during pre-training and limit the full utilization of pre-trained knowledge. We also demonstrate that prefix-tuning excels at preserving the representation space, despite its lower performance on downstream tasks. These findings suggest a simple two-step PEFT strategy called Prefix-Tuned PEFT (PT-PEFT), which successively performs prefix-tuning and then PEFT (i.e., Adapter, LoRA), combining the benefits of both. Experimental results show that PT-PEFT not only improves performance in image captioning and visual question answering compared to vanilla PEFT methods but also helps preserve the representation space of the four pre-trained models.
   Submitted 29 October, 2024; originally announced November 2024.
   Comments: Findings of EMNLP 2024

10. arXiv:2410.21836 [pdf, other] cs.CL
   Multi-aspect Depression Severity Assessment via Inductive Dialogue System
   Authors: Chaebin Lee, Seungyeon Seo, Heejin Do, Gary Geunbae Lee
   Abstract: With the advancement of chatbots and the growing demand for automatic depression detection, identifying depression in patient conversations has gained more attention. However, prior methods often assess depression in a binary way or with only a single score, without diverse feedback, and lack focus on enhancing dialogue responses. In this paper, we present a novel task of multi-aspect depression severity assessment via an inductive dialogue system (MaDSA), evaluating a patient's depression level on multiple criteria by incorporating assessment-aided response generation. Further, we propose a foundational system for MaDSA, which induces psychological dialogue responses with an auxiliary emotion classification task within a hierarchical severity assessment structure. We synthesize a conversational dataset annotated with eight aspects of depression severity alongside emotion labels, proven robust via human evaluations. Experimental results show potential for our preliminary work on MaDSA.
   Submitted 29 October, 2024; originally announced October 2024.

11. arXiv:2410.21566 [pdf, other] cs.CV
   MVSDet: Multi-View Indoor 3D Object Detection via Efficient Plane Sweeps
   Authors: Yating Xu, Chen Li, Gim Hee Lee
   Abstract: The key challenge of multi-view indoor 3D object detection is to infer accurate geometry information from images for precise 3D detection. Previous methods rely on NeRF for geometry reasoning. However, the geometry extracted from NeRF is generally inaccurate, which leads to sub-optimal detection performance. In this paper, we propose MVSDet, which utilizes plane sweeps for geometry-aware 3D object detection. To circumvent the requirement for a large number of depth planes for accurate depth prediction, we design a probabilistic sampling and soft weighting mechanism to decide the placement of pixel features on the 3D volume. We select multiple locations that score top in the probability volume for each pixel and use their probability score to indicate the confidence. We further apply recent pixel-aligned Gaussian Splatting to regularize depth prediction and improve detection performance with little computation overhead. Extensive experiments on the ScanNet and ARKitScenes datasets are conducted to show the superiority of our model. Our code is available at https://github.com/Pixie8888/MVSDet.
   Submitted 28 October, 2024; originally announced October 2024.
   Comments: Accepted by NeurIPS 2024
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21353">arXiv:2410.21353</a> <span> [<a href="https://arxiv.org/pdf/2410.21353">pdf</a>, <a href="https://arxiv.org/format/2410.21353">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Causal Interventions on Causal Paths: Mapping GPT-2's Reasoning From Syntax to Semantics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+I">Isabelle Lee</a>, <a href="/search/cs?searchtype=author&query=Lum%2C+J">Joshua Lum</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyi Liu</a>, <a href="/search/cs?searchtype=author&query=Yogatama%2C+D">Dani Yogatama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21353v1-abstract-short" style="display: inline;"> While interpretability research has shed light on some internal algorithms utilized by transformer-based LLMs, reasoning in natural language, with its deep contextuality and ambiguity, defies easy categorization. As a result, formulating clear and motivating questions for circuit analysis that rely on well-defined in-domain and out-of-domain examples required for causal interventions is challengin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21353v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21353v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21353v1-abstract-full" style="display: none;"> While interpretability research has shed light on some internal algorithms utilized by transformer-based LLMs, reasoning in natural language, with its deep contextuality and ambiguity, defies easy categorization. As a result, formulating clear and motivating questions for circuit analysis that rely on well-defined in-domain and out-of-domain examples required for causal interventions is challenging. Although significant work has investigated circuits for specific tasks, such as indirect object identification (IOI), deciphering natural language reasoning through circuits remains difficult due to its inherent complexity. In this work, we take initial steps to characterize causal reasoning in LLMs by analyzing clear-cut cause-and-effect sentences like "I opened an umbrella because it started raining," where causal interventions may be possible through carefully crafted scenarios using GPT-2 small. Our findings indicate that causal syntax is localized within the first 2-3 layers, while certain heads in later layers exhibit heightened sensitivity to nonsensical variations of causal sentences. This suggests that models may infer reasoning by (1) detecting syntactic cues and (2) isolating distinct heads in the final layers that focus on semantic relationships. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21353v1-abstract-full').style.display = 'none'; document.getElementById('2410.21353v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21287">arXiv:2410.21287</a> <span> [<a href="https://arxiv.org/pdf/2410.21287">pdf</a>, <a href="https://arxiv.org/format/2410.21287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Assessment of OpenAI o1-Preview for Higher Order Thinking in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Latif%2C+E">Ehsan Latif</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yizhu Gao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lehong Shi</a>, <a href="/search/cs?searchtype=author&query=Nayaaba%2C+M">Matthew Nayaaba</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyeonggeon Lee</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bewersdorff%2C+A">Arne Bewersdorff</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Luyang Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiantong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoran Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxi Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jichao Yu</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Weihang You</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+V+S">Vincent Shung Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jin Lu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+F">Fei Dou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+P">Ping Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a> , et al. 
(2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21287v1-abstract-short" style="display: inline;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21287v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21287v1-abstract-full" style="display: none;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacognition, data literacy, creative thinking, abstract reasoning, quantitative reasoning, logical reasoning, analogical reasoning, and scientific reasoning. We used validated instruments like the Ennis-Weir Critical Thinking Essay Test and the Biological Systems Thinking Test to compare the o1-preview's performance with human performance systematically. Our findings reveal that o1-preview outperforms humans in most categories, achieving 150% better results in systems thinking, computational thinking, data literacy, creative thinking, scientific reasoning, and abstract reasoning. However, compared to humans, it underperforms by around 25% in logical reasoning, critical thinking, and quantitative reasoning. In analogical reasoning, both o1-preview and humans achieved perfect scores. Despite these strengths, the o1-preview shows limitations in abstract reasoning, where human psychology students outperform it, highlighting the continued importance of human oversight in tasks requiring high-level abstraction. These results have significant educational implications, suggesting a shift toward developing human skills that complement AI, such as creativity, abstract reasoning, and critical thinking. This study emphasizes the transformative potential of AI in education and calls for a recalibration of educational goals, teaching methods, and curricula to align with an AI-driven world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'none'; document.getElementById('2410.21287v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An assessment of OpenAI o1-Preview for Higher Order Thinking in Education</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/cs?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=M%C4%85dry%2C+A">Aleksander Mądry</a>, <a href="/search/cs?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al. 
(395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18087">arXiv:2410.18087</a> <span> [<a href="https://arxiv.org/pdf/2410.18087">pdf</a>, <a href="https://arxiv.org/format/2410.18087">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CUPID: A Real-Time Session-Based Reciprocal Recommendation System for a One-on-One Social Discovery Platform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+B">Beomsu Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sangbum Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minchan Kim</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+J">Joonyoung Yi</a>, <a href="/search/cs?searchtype=author&query=Ha%2C+S">Sungjoo Ha</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Suhyun Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Youngsoo Lee</a>, <a href="/search/cs?searchtype=author&query=Yeom%2C+G">Gihun Yeom</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Buru Chang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gihun Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18087v1-abstract-short" style="display: inline;"> This study introduces CUPID, a novel approach to session-based reciprocal recommendation systems designed for a real-time one-on-one social discovery platform. In such platforms, low latency is critical to enhance user experiences. However, conventional session-based approaches struggle with high latency due to the demands of modeling sequential user behavior for each recommendation process. Addit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18087v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18087v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18087v1-abstract-full" style="display: none;"> This study introduces CUPID, a novel approach to session-based reciprocal recommendation systems designed for a real-time one-on-one social discovery platform. In such platforms, low latency is critical to enhance user experiences. However, conventional session-based approaches struggle with high latency due to the demands of modeling sequential user behavior for each recommendation process. Additionally, given the reciprocal nature of the platform, where users act as items for each other, training recommendation models on large-scale datasets is computationally prohibitive using conventional methods. To address these challenges, CUPID decouples the time-intensive user session modeling from the real-time user matching process to reduce inference time. Furthermore, CUPID employs a two-phase training strategy that separates the training of embedding and prediction layers, significantly reducing the computational burden by decreasing the number of sequential model inferences by several hundredfold. 
Extensive experiments on large-scale Azar datasets demonstrate CUPID's effectiveness in a real-world production environment. Notably, CUPID reduces response latency by more than 76% compared to non-asynchronous systems, while significantly improving user engagement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18087v1-abstract-full').style.display = 'none'; document.getElementById('2410.18087v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The 2nd International Workshop on User Understanding from Big Data Workshop (DMU2 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17712">arXiv:2410.17712</a> <span> [<a href="https://arxiv.org/pdf/2410.17712">pdf</a>, <a href="https://arxiv.org/format/2410.17712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Data-Driven Odyssey in Solar Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D+Y">Do Young Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyunghyun Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyeongseop Lee</a>, <a href="/search/cs?searchtype=author&query=Das%2C+N">Niloy Das</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seong-Woo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17712v1-abstract-short" style="display: inline;"> Solar vehicles, which simultaneously produce and consume energy, require meticulous energy management. However, potential users often feel uncertain about their operation compared to conventional vehicles. This study presents a simulator designed to help users understand long-distance travel in solar vehicles and recognize the importance of proper energy management. By utilizing Google Maps data a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17712v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17712v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17712v1-abstract-full" style="display: none;"> Solar vehicles, which simultaneously produce and consume energy, require meticulous energy management. However, potential users often feel uncertain about their operation compared to conventional vehicles. This study presents a simulator designed to help users understand long-distance travel in solar vehicles and recognize the importance of proper energy management. By utilizing Google Maps data and weather information, the simulator replicates real-world driving conditions and provides a dashboard displaying vehicle status, updated hourly based on user-inputted speed. 
Users can explore various speed policy scenarios and receive recommendations for optimal driving strategies. The simulator's effectiveness was validated using the route of the World Solar Challenge (WSC). This research enables users to monitor energy dynamics before a journey, enhancing their understanding of energy management and informing appropriate speed decisions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17712v1-abstract-full').style.display = 'none'; document.getElementById('2410.17712v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14595">arXiv:2410.14595</a> <span> [<a href="https://arxiv.org/pdf/2410.14595">pdf</a>, <a href="https://arxiv.org/ps/2410.14595">ps</a>, <a href="https://arxiv.org/format/2410.14595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DRACO-DehazeNet: An Efficient Image Dehazing Network Combining Detail Recovery and a Novel Contrastive Learning Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G+Y">Gao Yu Lee</a>, <a href="/search/cs?searchtype=author&query=Dam%2C+T">Tanmoy Dam</a>, <a href="/search/cs?searchtype=author&query=Ferdaus%2C+M+M">Md Meftahul Ferdaus</a>, <a href="/search/cs?searchtype=author&query=Poenar%2C+D+P">Daniel Puiu Poenar</a>, <a href="/search/cs?searchtype=author&query=Duong%2C+V">Vu Duong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14595v1-abstract-short" style="display: inline;"> Image dehazing is crucial for clarifying images obscured by haze or fog, but current learning-based approaches are dependent on large volumes of training data and hence consume significant computational power. Additionally, their performance is often inadequate under non-uniform or heavy haze. To address these challenges, we developed the Detail Recovery And Contrastive DehazeNet, which facilitate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14595v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14595v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14595v1-abstract-full" style="display: none;"> Image dehazing is crucial for clarifying images obscured by haze or fog, but current learning-based approaches are dependent on large volumes of training data and hence consume significant computational power. Additionally, their performance is often inadequate under non-uniform or heavy haze. To address these challenges, we developed the Detail Recovery And Contrastive DehazeNet, which facilitates efficient and effective dehazing via a dense dilated inverted residual block and an attention-based detail recovery network that tailors enhancements to specific dehazed scene contexts. 
A major innovation is its ability to train effectively with limited data, achieved through a novel quadruplet loss-based contrastive dehazing paradigm. This approach distinctly separates hazy and clear image features while also distinguishing lower-quality and higher-quality dehazed images obtained from each sub-module of our network, thereby refining the dehazing process to a larger extent. Extensive tests on a variety of benchmarked haze datasets demonstrated the superiority of our approach. The code repository for this work will be available soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14595v1-abstract-full').style.display = 'none'; document.getElementById('2410.14595v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to a journal and currently under review. Once the paper is accepted and published, the copyright will be transferred to the corresponding journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13116">arXiv:2410.13116</a> <span> [<a href="https://arxiv.org/pdf/2410.13116">pdf</a>, <a href="https://arxiv.org/format/2410.13116">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning to Summarize from LLM-generated Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+H">Hwanjun Song</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+T">Taewon Yun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yuho Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gihun Lee</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jason Cai</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13116v1-abstract-short" style="display: inline;"> Developing effective text summarizers remains a challenge due to issues like hallucinations, key information omissions, and verbosity in LLM-generated summaries. This work explores using LLM-generated feedback to improve summary quality by aligning the summaries with human preferences for faithfulness, completeness, and conciseness. 
We introduce FeedSum, a large-scale dataset containing multi-dime… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13116v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13116v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13116v1-abstract-full" style="display: none;"> Developing effective text summarizers remains a challenge due to issues like hallucinations, key information omissions, and verbosity in LLM-generated summaries. This work explores using LLM-generated feedback to improve summary quality by aligning the summaries with human preferences for faithfulness, completeness, and conciseness. We introduce FeedSum, a large-scale dataset containing multi-dimensional LLM feedback on summaries of varying quality across diverse domains. Our experiments show how feedback quality, dimensionality, and granularity influence preference learning, revealing that high-quality, multi-dimensional, fine-grained feedback significantly improves summary generation. We also compare two methods for using this feedback: supervised fine-tuning and direct preference optimization. Finally, we introduce SummLlama3-8b, a model that outperforms the nearly 10x larger Llama3-70b-instruct in generating human-preferred summaries, demonstrating that smaller models can achieve superior performance with appropriate training. The full dataset will be released soon. The SummLlama3-8B model is now available at https://huggingface.co/DISLab/SummLlama3-8B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13116v1-abstract-full').style.display = 'none'; document.getElementById('2410.13116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
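<p class="is-size-7">A minimal sketch of how the released SummLlama3-8B checkpoint linked above could be loaded with Hugging Face transformers and prompted for a summary; the prompt wording, generation settings, and hardware setup below are illustrative assumptions and are not taken from the paper.</p> <pre><code class="language-python">
from transformers import pipeline

# Load the checkpoint referenced in the abstract above.
# device_map="auto" assumes accelerate is installed and a GPU is available;
# drop it to run on CPU.
generator = pipeline("text-generation", model="DISLab/SummLlama3-8B", device_map="auto")

document = "..."  # replace with the text to be summarized
prompt = (
    "Summarize the following document faithfully, completely, and concisely.\n\n"
    f"Document:\n{document}\n\nSummary:"
)

result = generator(prompt, max_new_tokens=256, do_sample=False)
print(result[0]["generated_text"])
</code></pre>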
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09780">arXiv:2410.09780</a> <span> [<a href="https://arxiv.org/pdf/2410.09780">pdf</a>, <a href="https://arxiv.org/format/2410.09780">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Expanding Search Space with Diverse Prompting Agents: An Efficient Sampling Approach for LLM Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gisang Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sangwoo Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junyoung Park</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+A">Andrew Chung</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sieun Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+Y">Yoonah Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+B">Byungju Kim</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Min-gyu Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09780v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have exhibited remarkable capabilities in many complex tasks including mathematical reasoning. However, traditional approaches heavily rely on ensuring self-consistency within a single prompting method, which limits the exploration of diverse problem-solving strategies. This study addresses these limitations by performing an experimental analysis of distinct prompting me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09780v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09780v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09780v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have exhibited remarkable capabilities in many complex tasks including mathematical reasoning. However, traditional approaches heavily rely on ensuring self-consistency within a single prompting method, which limits the exploration of diverse problem-solving strategies. This study addresses these limitations by performing an experimental analysis of distinct prompting methods within the domain of mathematical reasoning. Our findings demonstrate that each method explores a distinct search space, and this differentiation becomes more evident with increasing problem complexity. To leverage this phenomenon, we applied an efficient sampling process that uniformly combines samples from these diverse methods, which not only expands the maximum search space but also achieves higher performance with fewer runs compared to single methods. In particular, within MATH-hard, the subset of difficult questions in the MATH dataset, the maximum search space was achieved while utilizing approximately 43% fewer runs than single methods on average. These findings highlight the importance of integrating diverse problem-solving strategies to enhance the reasoning abilities of LLMs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09780v1-abstract-full').style.display = 'none'; document.getElementById('2410.09780v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03197">arXiv:2410.03197</a> <span> [<a href="https://arxiv.org/pdf/2410.03197">pdf</a>, <a href="https://arxiv.org/format/2410.03197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Cross-lingual Transfer for Automatic Question Generation by Learning Interrogative Structures in Target Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hwang%2C+S">Seonjeong Hwang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yunsu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03197v1-abstract-short" style="display: inline;"> Automatic question generation (QG) serves a wide range of purposes, such as augmenting question-answering (QA) corpora, enhancing chatbot systems, and developing educational materials. Despite its importance, most existing datasets predominantly focus on English, resulting in a considerable gap in data availability for other languages. Cross-lingual transfer for QG (XLT-QG) addresses this limitati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03197v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03197v1-abstract-full" style="display: none;"> Automatic question generation (QG) serves a wide range of purposes, such as augmenting question-answering (QA) corpora, enhancing chatbot systems, and developing educational materials. Despite its importance, most existing datasets predominantly focus on English, resulting in a considerable gap in data availability for other languages. Cross-lingual transfer for QG (XLT-QG) addresses this limitation by allowing models trained on high-resource language datasets to generate questions in low-resource languages. In this paper, we propose a simple and efficient XLT-QG method that operates without the need for monolingual, parallel, or labeled data in the target language, utilizing a small language model. Our model, trained solely on English QA datasets, learns interrogative structures from a limited set of question exemplars, which are then applied to generate questions in the target language. 
Experimental results show that our method outperforms several XLT-QG baselines and achieves performance comparable to GPT-3.5-turbo across different languages. Additionally, the synthetic data generated by our model proves beneficial for training multilingual QA models. With significantly fewer parameters than large language models and without requiring additional training for target languages, our approach offers an effective solution for QG and QA tasks across various languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03197v1-abstract-full').style.display = 'none'; document.getElementById('2410.03197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01190">arXiv:2410.01190</a> <span> [<a href="https://arxiv.org/pdf/2410.01190">pdf</a>, <a href="https://arxiv.org/format/2410.01190">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> Integrating Visual and Textual Inputs for Searching Large-Scale Map Collections with CLIP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mahowald%2C+J">Jamie Mahowald</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B+C+G">Benjamin Charles Germain Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01190v1-abstract-short" style="display: inline;"> Despite the prevalence and historical importance of maps in digital collections, current methods of navigating and exploring map collections are largely restricted to catalog records and structured metadata. In this paper, we explore the potential for interactively searching large-scale map collections using natural language inputs ("maps with sea monsters"), visual inputs (i.e., reverse image sea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01190v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01190v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01190v1-abstract-full" style="display: none;"> Despite the prevalence and historical importance of maps in digital collections, current methods of navigating and exploring map collections are largely restricted to catalog records and structured metadata. In this paper, we explore the potential for interactively searching large-scale map collections using natural language inputs ("maps with sea monsters"), visual inputs (i.e., reverse image search), and multimodal inputs (an example map + "more grayscale"). 
As a case study, we adopt 562,842 images of maps publicly accessible via the Library of Congress's API. To accomplish this, we use the multimodal Contrastive Language-Image Pre-training (CLIP) machine learning model to generate embeddings for these maps, and we develop code to implement exploratory search capabilities with these input strategies. We present results for example searches created in consultation with staff in the Library of Congress's Geography and Map Division and describe the strengths, weaknesses, and possibilities for these search queries. Moreover, we introduce a fine-tuning dataset of 10,504 map-caption pairs, along with an architecture for fine-tuning a CLIP model on this dataset. To facilitate re-use, we provide all of our code in documented, interactive Jupyter notebooks and place all code into the public domain. Lastly, we discuss the opportunities and challenges for applying these approaches across both digitized and born-digital collections held by galleries, libraries, archives, and museums. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01190v1-abstract-full').style.display = 'none'; document.getElementById('2410.01190v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 7 figures, accepted at the Computational Humanities Research Conference (CHR 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00513">arXiv:2410.00513</a> <span> [<a href="https://arxiv.org/pdf/2410.00513">pdf</a>, <a href="https://arxiv.org/format/2410.00513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Cross-lingual Back-Parsing: Utterance Synthesis from Meaning Representation for Zero-Resource Semantic Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+D">Deokhyung Kang</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+S">Seonjeong Hwang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yunsu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00513v1-abstract-short" style="display: inline;"> Recent efforts have aimed to utilize multilingual pretrained language models (mPLMs) to extend semantic parsing (SP) across multiple languages without requiring extensive annotations. However, achieving zero-shot cross-lingual transfer for SP remains challenging, leading to a performance gap between source and target languages. 
In this study, we propose Cross-Lingual Back-Parsing (CBP), a novel da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00513v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00513v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00513v1-abstract-full" style="display: none;"> Recent efforts have aimed to utilize multilingual pretrained language models (mPLMs) to extend semantic parsing (SP) across multiple languages without requiring extensive annotations. However, achieving zero-shot cross-lingual transfer for SP remains challenging, leading to a performance gap between source and target languages. In this study, we propose Cross-Lingual Back-Parsing (CBP), a novel data augmentation methodology designed to enhance cross-lingual transfer for SP. Leveraging the representation geometry of the mPLMs, CBP synthesizes target language utterances from source meaning representations. Our methodology effectively performs cross-lingual data augmentation in challenging zero-resource settings, by utilizing only labeled data in the source language and monolingual corpora. Extensive experiments on two cross-language SP benchmarks (Mschema2QA and Xspider) demonstrate that CBP brings substantial gains in the target language. Further analysis of the synthesized utterances shows that our method successfully generates target language utterances with high slot value alignment rates while preserving semantic integrity. Our codes and data are publicly available at https://github.com/deokhk/CBP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00513v1-abstract-full').style.display = 'none'; document.getElementById('2410.00513v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00414">arXiv:2410.00414</a> <span> [<a href="https://arxiv.org/pdf/2410.00414">pdf</a>, <a href="https://arxiv.org/ps/2410.00414">ps</a>, <a href="https://arxiv.org/format/2410.00414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Semantic Parsing with Candidate Expressions for Knowledge Base Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nam%2C+D">Daehwan Nam</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00414v2-abstract-short" style="display: inline;"> Semantic parsers convert natural language to logical forms, which can be evaluated on knowledge bases (KBs) to produce denotations. 
Recent semantic parsers have been developed with sequence-to-sequence (seq2seq) pre-trained language models (PLMs) or large language models, where the models treat logical forms as sequences of tokens. For syntactic and semantic validity, the semantic parsers use gram… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00414v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00414v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00414v2-abstract-full" style="display: none;"> Semantic parsers convert natural language to logical forms, which can be evaluated on knowledge bases (KBs) to produce denotations. Recent semantic parsers have been developed with sequence-to-sequence (seq2seq) pre-trained language models (PLMs) or large language models, where the models treat logical forms as sequences of tokens. For syntactic and semantic validity, the semantic parsers use grammars that enable constrained decoding. However, the grammars lack the ability to utilize large information of KBs, although logical forms contain representations of KB elements, such as entities or relations. In this work, we propose a grammar augmented with candidate expressions for semantic parsing on a large KB with a seq2seq PLM. The grammar defines actions as production rules, and our semantic parser predicts actions during inference under the constraints by types and candidate expressions. We apply the grammar to knowledge base question answering, where the constraints by candidate expressions assist a semantic parser to generate valid KB elements. In experiments on two benchmarks, KQA Pro and Overnight, the constraints by candidate expressions increased the accuracy of our semantic parser, whether it was trained with strong supervision or weak supervision. Our semantic parser achieved state-of-the-art accuracies on KQA Pro and Overnight, and its implementation is publicly available at https://github.com/daehwannam/candexpr-sp.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00414v2-abstract-full').style.display = 'none'; document.getElementById('2410.00414v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
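<p class="is-size-7">The idea of constraining a decoder so that it only emits valid KB elements can be illustrated with a generic prefix-trie filter. This is only a simplified sketch of candidate-expression constraints, not the paper's grammar or parser; the whitespace tokenization, function names, and toy candidates below are assumptions.</p> <pre><code class="language-python">
from collections import defaultdict

def build_trie(candidates):
    """Map each prefix of a candidate token sequence to its allowed next tokens."""
    trie = defaultdict(set)
    for tokens in candidates:
        for i in range(len(tokens)):
            trie[tuple(tokens[:i])].add(tokens[i])
    return trie

def allowed_next_tokens(prefix, trie):
    """Tokens the decoder may emit after `prefix` while staying inside some candidate."""
    return trie.get(tuple(prefix), set())

# Toy candidate expressions for KB entities, tokenized by whitespace.
candidates = [
    "Barack Obama".split(),
    "Barack Obama Sr .".split(),
    "Michelle Obama".split(),
]
trie = build_trie(candidates)
print(allowed_next_tokens([], trie))                   # {'Barack', 'Michelle'}
print(allowed_next_tokens(["Barack", "Obama"], trie))  # {'Sr'}
</code></pre>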
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18622">arXiv:2409.18622</a> <span> [<a href="https://arxiv.org/pdf/2409.18622">pdf</a>, <a href="https://arxiv.org/format/2409.18622">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Based Linguistic Feature Extraction for Enhancing Multi-lingual and Low-Resource Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Youngjae Kim</a>, <a href="/search/cs?searchtype=author&query=Jeon%2C+Y">Yejin Jeon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18622v1-abstract-short" style="display: inline;"> The difficulty of acquiring abundant, high-quality data, especially in multi-lingual contexts, has sparked interest in addressing low-resource scenarios. Moreover, current literature relies on fixed expressions from language IDs, which results in the inadequate learning of language representations, and the failure to generate speech in unseen languages. To address these challenges, we propose a nove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18622v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18622v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18622v1-abstract-full" style="display: none;"> The difficulty of acquiring abundant, high-quality data, especially in multi-lingual contexts, has sparked interest in addressing low-resource scenarios. Moreover, current literature relies on fixed expressions from language IDs, which results in the inadequate learning of language representations, and the failure to generate speech in unseen languages. To address these challenges, we propose a novel method that directly extracts linguistic features from audio input while effectively filtering out miscellaneous acoustic information including speaker-specific attributes like timbre. Subjective and objective evaluations affirm the effectiveness of our approach for multi-lingual text-to-speech, and highlight its superiority in low-resource transfer learning for previously unseen languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18622v1-abstract-full').style.display = 'none'; document.getElementById('2409.18622v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18345">arXiv:2409.18345</a> <span> [<a href="https://arxiv.org/pdf/2409.18345">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> A Generalized LLM-Augmented BIM Framework: Application to a Speech-to-BIM system </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Ghang Lee</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+S">Suhyung Jang</a>, <a href="/search/cs?searchtype=author&query=Hyun%2C+S">Seokho Hyun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18345v1-abstract-short" style="display: inline;"> Performing building information modeling (BIM) tasks is a complex process that imposes a steep learning curve and a heavy cognitive load due to the necessity of remembering sequences of numerous commands. With the rapid advancement of large language models (LLMs), it is foreseeable that BIM tasks, including querying and managing BIM data, 4D and 5D BIM, design compliance checking, or authoring a d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18345v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18345v1-abstract-full" style="display: none;"> Performing building information modeling (BIM) tasks is a complex process that imposes a steep learning curve and a heavy cognitive load due to the necessity of remembering sequences of numerous commands. With the rapid advancement of large language models (LLMs), it is foreseeable that BIM tasks, including querying and managing BIM data, 4D and 5D BIM, design compliance checking, or authoring a design, using written or spoken natural language (i.e., text-to-BIM or speech-to-BIM), will soon supplant traditional graphical user interfaces. This paper proposes a generalized LLM-augmented BIM framework to expedite the development of LLM-enhanced BIM applications by providing a step-by-step development process. The proposed framework consists of six steps: interpret-fill-match-structure-execute-check. The paper demonstrates the applicability of the proposed framework through implementing a speech-to-BIM application, NADIA-S (Natural-language-based Architectural Detailing through Interaction with Artificial Intelligence via Speech), using exterior wall detailing as an example. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18345v1-abstract-full').style.display = 'none'; document.getElementById('2409.18345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the 41st International Conference of CIB W78. Marrakech, Morocco, 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> http://itc.scix.net/paper/w78-2024-155 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17988">arXiv:2409.17988</a> <span> [<a href="https://arxiv.org/pdf/2409.17988">pdf</a>, <a href="https://arxiv.org/format/2409.17988">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or Low-light Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Low%2C+W+F">Weng Fei Low</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17988v1-abstract-short" style="display: inline;"> The stark contrast in the design philosophy of an event camera makes it particularly ideal for operating under high-speed, high dynamic range and low-light conditions, where standard cameras underperform. Nonetheless, event cameras still suffer from some amount of motion blur, especially under these challenging conditions, in contrary to what most think. This is attributed to the limited bandwidth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17988v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17988v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17988v1-abstract-full" style="display: none;"> The stark contrast in the design philosophy of an event camera makes it particularly ideal for operating under high-speed, high dynamic range and low-light conditions, where standard cameras underperform. Nonetheless, event cameras still suffer from some amount of motion blur, especially under these challenging conditions, in contrary to what most think. This is attributed to the limited bandwidth of the event sensor pixel, which is mostly proportional to the light intensity. 
Thus, to ensure that event cameras can truly excel in such conditions where it has an edge over standard cameras, it is crucial to account for event motion blur in downstream applications, especially reconstruction. However, none of the recent works on reconstructing Neural Radiance Fields (NeRFs) from events, nor event simulators, have considered the full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a novel method to directly and effectively reconstruct blur-minimal NeRFs from motion-blurred events generated under high-speed motion or low-light conditions. The core component of this work is a physically-accurate pixel bandwidth model proposed to account for event motion blur under arbitrary speed and lighting conditions. We also introduce a novel threshold-normalized total variation loss to improve the regularization of large textureless patches. Experiments on real and novel realistically simulated sequences verify our effectiveness. Our code, event simulator and synthetic event dataset will be open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17988v1-abstract-full').style.display = 'none'; document.getElementById('2409.17988v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024. Project website is accessible at https://wengflow.github.io/deblur-e-nerf</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17472">arXiv:2409.17472</a> <span> [<a href="https://arxiv.org/pdf/2409.17472">pdf</a>, <a href="https://arxiv.org/format/2409.17472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with Scoring-aware Multiple Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Do%2C+H">Heejin Do</a>, <a href="/search/cs?searchtype=author&query=Ryu%2C+S">Sangwon Ryu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17472v1-abstract-short" style="display: inline;"> Recent advances in automated essay scoring (AES) have shifted towards evaluating multiple traits to provide enriched feedback. Like typical AES systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure agreement with human raters, aligning closely with the rating schema; however, its non-differentiable nature prevents its direct use in neural network training. 
In this paper, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17472v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17472v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17472v1-abstract-full" style="display: none;"> Recent advances in automated essay scoring (AES) have shifted towards evaluating multiple traits to provide enriched feedback. Like typical AES systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure agreement with human raters, aligning closely with the rating schema; however, its non-differentiable nature prevents its direct use in neural network training. In this paper, we propose Scoring-aware Multi-reward Reinforcement Learning (SaMRL), which integrates actual evaluation schemes into the training process by designing QWK-based rewards with a mean-squared error penalty for multi-trait AES. Existing reinforcement learning (RL) applications in AES are limited to classification models despite associated performance degradation, as RL requires probability distributions; instead, we adopt an autoregressive score generation framework to leverage token generation probabilities for robust multi-trait score predictions. Empirical analyses demonstrate that SaMRL facilitates model training, notably enhancing scoring of previously inferior prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17472v1-abstract-full').style.display = 'none'; document.getElementById('2409.17472v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
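</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">The quadratic weighted kappa (QWK) named in this abstract is a standard agreement metric. The sketch below is an editorial illustration of QWK and of a QWK-plus-MSE-penalty reward in the spirit of SaMRL; it is not code from the paper, and the function names and penalty coefficient are assumptions.</span> </p> <pre>
import numpy as np

def quadratic_weighted_kappa(y_true, y_pred, num_ratings):
    """QWK between two integer rating vectors with values in [0, num_ratings)."""
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    # Observed agreement matrix O and marginal rating histograms.
    O = np.zeros((num_ratings, num_ratings), dtype=float)
    for t, p in zip(y_true, y_pred):
        O[t, p] += 1.0
    hist_true = O.sum(axis=1)
    hist_pred = O.sum(axis=0)
    # Expected matrix E under chance agreement, scaled to the same total count as O.
    E = np.outer(hist_true, hist_pred) / len(y_true)
    # Quadratic disagreement weights.
    idx = np.arange(num_ratings)
    W = (idx[:, None] - idx[None, :]) ** 2 / (num_ratings - 1) ** 2
    return 1.0 - (W * O).sum() / (W * E).sum()

def scoring_aware_reward(y_true, y_pred, num_ratings, mse_coeff=0.1):
    # Hypothetical reward shape: QWK minus an MSE penalty (coefficient assumed).
    qwk = quadratic_weighted_kappa(y_true, y_pred, num_ratings)
    mse = float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
    return qwk - mse_coeff * mse
</pre> <p class="is-size-7">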
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15755">arXiv:2409.15755</a> <span> [<a href="https://arxiv.org/pdf/2409.15755">pdf</a>, <a href="https://arxiv.org/format/2409.15755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Stage-Wise Reward Shaping for Acrobatic Robots: A Constrained Multi-Objective Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dohyeong Kim</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+H">Hyeokjin Kwon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junseok Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gunmin Lee</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S">Songhwai Oh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15755v1-abstract-short" style="display: inline;"> As the complexity of tasks addressed through reinforcement learning (RL) increases, the definition of reward functions also has become highly complicated. We introduce an RL method aimed at simplifying the reward-shaping process through intuitive strategies. Initially, instead of a single reward function composed of various terms, we define multiple reward and cost functions within a constrained m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15755v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15755v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15755v1-abstract-full" style="display: none;"> As the complexity of tasks addressed through reinforcement learning (RL) increases, the definition of reward functions also has become highly complicated. We introduce an RL method aimed at simplifying the reward-shaping process through intuitive strategies. Initially, instead of a single reward function composed of various terms, we define multiple reward and cost functions within a constrained multi-objective RL (CMORL) framework. For tasks involving sequential complex movements, we segment the task into distinct stages and define multiple rewards and costs for each stage. Finally, we introduce a practical CMORL algorithm that maximizes objectives based on these rewards while satisfying constraints defined by the costs. The proposed method has been successfully demonstrated across a variety of acrobatic tasks in both simulation and real-world environments. Additionally, it has been shown to successfully perform tasks compared to existing RL and constrained RL algorithms. Our code is available at https://github.com/rllab-snu/Stage-Wise-CMORL. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15755v1-abstract-full').style.display = 'none'; document.getElementById('2409.15755v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14987">arXiv:2409.14987</a> <span> [<a href="https://arxiv.org/pdf/2409.14987">pdf</a>, <a href="https://arxiv.org/format/2409.14987">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> A Comparative Quality Metric for Untargeted Fuzzing with Logic State Coverage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gwangmu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14987v1-abstract-short" style="display: inline;"> While fuzzing is widely accepted as an efficient program testing technique, it is still unclear how to measure the comparative quality of different fuzzers. The current de facto quality metrics are edge coverage and the number of discovered bugs, but they are frequently discredited by inconclusive, exaggerated, or even counter-intuitive results. To establish a more reliable quality metric, we fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14987v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14987v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14987v1-abstract-full" style="display: none;"> While fuzzing is widely accepted as an efficient program testing technique, it is still unclear how to measure the comparative quality of different fuzzers. The current de facto quality metrics are edge coverage and the number of discovered bugs, but they are frequently discredited by inconclusive, exaggerated, or even counter-intuitive results. To establish a more reliable quality metric, we first note that fuzzing aims to reduce the number of unknown abnormal behaviors by observing more interesting (i.e., relating to unknown abnormal) behaviors. The more interesting behaviors a fuzzer has observed, the stronger guarantee it can provide about the absence of unknown abnormal behaviors. This suggests that the number of observed interesting behaviors must directly indicate the fuzzing quality. In this work, we propose logic state coverage as a proxy metric to count observed interesting behaviors. A logic state is a set of satisfied branches during one execution, where its coverage is the count of individual observed logic states during a fuzzing campaign. 
A logic state distinguishes less repetitive (i.e., more interesting) behaviors in a finer granularity, making the amount of logic state coverage reliably proportional to the number of observed interesting behaviors. We implemented logic state coverage using a bloom filter and performed a preliminary evaluation with AFL++ and XMLLint. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14987v1-abstract-full').style.display = 'none'; document.getElementById('2409.14987v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.5; F.3.2; G.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13580">arXiv:2409.13580</a> <span> [<a href="https://arxiv.org/pdf/2409.13580">pdf</a>, <a href="https://arxiv.org/format/2409.13580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Lyapunov-guided Deep Reinforcement Learning for Semantic-aware AoI Minimization in UAV-assisted Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yusi Long</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+S">Shimin Gong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Sumei Sun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gary Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lanhua Li</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13580v1-abstract-short" style="display: inline;"> This paper investigates an unmanned aerial vehicle (UAV)-assisted semantic network where the ground users (GUs) periodically capture and upload the sensing information to a base station (BS) via UAVs' relaying. Both the GUs and the UAVs can extract semantic information from large-size raw data and transmit it to the BS for recovery. Smaller-size semantic information reduces latency and improves in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13580v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13580v1-abstract-full" style="display: none;"> This paper investigates an unmanned aerial vehicle (UAV)-assisted semantic network where the ground users (GUs) periodically capture and upload the sensing information to a base station (BS) via UAVs' relaying. Both the GUs and the UAVs can extract semantic information from large-size raw data and transmit it to the BS for recovery. 
Smaller-size semantic information reduces latency and improves information freshness, while larger-size semantic information enables more accurate data reconstruction at the BS, preserving the value of original information. We introduce a novel semantic-aware age-of-information (SAoI) metric to capture both information freshness and semantic importance, and then formulate a time-averaged SAoI minimization problem by jointly optimizing the UAV-GU association, the semantic extraction, and the UAVs' trajectories. We decouple the original problem into a series of subproblems via the Lyapunov framework and then use hierarchical deep reinforcement learning (DRL) to solve each subproblem. Specifically, the UAV-GU association is determined by DRL, followed by the optimization module updating the semantic extraction strategy and UAVs' deployment. Simulation results show that the hierarchical structure improves learning efficiency. Moreover, it achieves low AoI through semantic extraction while ensuring minimal loss of original information, outperforming the existing baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13580v1-abstract-full').style.display = 'none'; document.getElementById('2409.13580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE TWC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10244">arXiv:2409.10244</a> <span> [<a href="https://arxiv.org/pdf/2409.10244">pdf</a>, <a href="https://arxiv.org/format/2409.10244">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> ES-KT-24: A Multimodal Knowledge Tracing Benchmark Dataset with Educational Game Playing Video and Synthetic Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dohee Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+U">Unggi Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sookbun Lee</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+J">Jiyeong Bae</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+T">Taekyung Ahn</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaekwon Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gunho Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hyeoncheol Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10244v1-abstract-short" style="display: inline;"> This paper introduces ES-KT-24, a novel multimodal Knowledge Tracing (KT) dataset for intelligent tutoring systems in educational game contexts. Although KT is crucial in adaptive learning, existing datasets often lack game-based and multimodal elements. 
ES-KT-24 addresses these limitations by incorporating educational game-playing videos, synthetically generated question text, and detailed game l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10244v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10244v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10244v1-abstract-full" style="display: none;"> This paper introduces ES-KT-24, a novel multimodal Knowledge Tracing (KT) dataset for intelligent tutoring systems in educational game contexts. Although KT is crucial in adaptive learning, existing datasets often lack game-based and multimodal elements. ES-KT-24 addresses these limitations by incorporating educational game-playing videos, synthetically generated question text, and detailed game logs. The dataset covers Mathematics, English, Indonesian, and Malaysian subjects, emphasizing diversity and including non-English content. The synthetic text component, generated using a large language model, encompasses 28 distinct knowledge concepts and 182 questions, featuring 15,032 users and 7,782,928 interactions. Our benchmark experiments demonstrate the dataset's utility for KT research by comparing Deep learning-based KT models with Language Model-based Knowledge Tracing (LKT) approaches. Notably, LKT models showed slightly higher performance than traditional DKT models, highlighting the potential of language model-based approaches in this field. Furthermore, ES-KT-24 has the potential to significantly advance research in multimodal KT models and learning analytics. By integrating game-playing videos and detailed game logs, this dataset offers a unique approach to dissecting student learning patterns through advanced data analysis and machine-learning techniques. It has the potential to unearth new insights into the learning process and inspire further exploration in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10244v1-abstract-full').style.display = 'none'; document.getElementById('2409.10244v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10172">arXiv:2409.10172</a> <span> [<a href="https://arxiv.org/pdf/2409.10172">pdf</a>, <a href="https://arxiv.org/format/2409.10172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LiLoc: Lifelong Localization using Adaptive Submap Joining and Egocentric Factor Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yixin Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanyan Li</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+K">Kun Qian</a>, <a href="/search/cs?searchtype=author&query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10172v1-abstract-short" style="display: inline;"> This paper proposes a versatile graph-based lifelong localization framework, LiLoc, which enhances its timeliness by maintaining a single central session while improves the accuracy through multi-modal factors between the central and subsidiary sessions. First, an adaptive submap joining strategy is employed to generate prior submaps (keyframes and poses) for the central session, and to provide pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10172v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10172v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10172v1-abstract-full" style="display: none;"> This paper proposes a versatile graph-based lifelong localization framework, LiLoc, which enhances its timeliness by maintaining a single central session while improves the accuracy through multi-modal factors between the central and subsidiary sessions. First, an adaptive submap joining strategy is employed to generate prior submaps (keyframes and poses) for the central session, and to provide priors for subsidiaries when constraints are needed for robust localization. Next, a coarse-to-fine pose initialization for subsidiary sessions is performed using vertical recognition and ICP refinement in the global coordinate frame. To elevate the accuracy of subsequent localization, we propose an egocentric factor graph (EFG) module that integrates the IMU preintegration, LiDAR odometry and scan match factors in a joint optimization manner. Specifically, the scan match factors are constructed by a novel propagation model that efficiently distributes the prior constrains as edges to the relevant prior pose nodes, weighted by noises based on keyframe registration errors. Additionally, the framework supports flexible switching between two modes: relocalization (RLM) and incremental localization (ILM) based on the proposed overlap-based mechanism to select or update the prior submaps from central session. 
The proposed LiLoc is tested on public and custom datasets, demonstrating accurate localization performance against state-of-the-art methods. Our codes will be publicly available on https://github.com/Yixin-F/LiLoc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10172v1-abstract-full').style.display = 'none'; document.getElementById('2409.10172v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08839">arXiv:2409.08839</a> <span> [<a href="https://arxiv.org/pdf/2409.08839">pdf</a>, <a href="https://arxiv.org/format/2409.08839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RF Challenge: The Data-Driven Radio Frequency Signal Separation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lancho%2C+A">Alejandro Lancho</a>, <a href="/search/cs?searchtype=author&query=Weiss%2C+A">Amir Weiss</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+C+F">Gary C. F. Lee</a>, <a href="/search/cs?searchtype=author&query=Jayashankar%2C+T">Tejas Jayashankar</a>, <a href="/search/cs?searchtype=author&query=Kurien%2C+B">Binoy Kurien</a>, <a href="/search/cs?searchtype=author&query=Polyanskiy%2C+Y">Yury Polyanskiy</a>, <a href="/search/cs?searchtype=author&query=Wornell%2C+G+W">Gregory W. Wornell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08839v1-abstract-short" style="display: inline;"> This paper addresses the critical problem of interference rejection in radio-frequency (RF) signals using a novel, data-driven approach that leverages state-of-the-art AI models. Traditionally, interference rejection algorithms are manually tailored to specific types of interference. This work introduces a more scalable data-driven solution and contains the following contributions. First, we prese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08839v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08839v1-abstract-full" style="display: none;"> This paper addresses the critical problem of interference rejection in radio-frequency (RF) signals using a novel, data-driven approach that leverages state-of-the-art AI models. Traditionally, interference rejection algorithms are manually tailored to specific types of interference. This work introduces a more scalable data-driven solution and contains the following contributions. 
First, we present an insightful signal model that serves as a foundation for developing and analyzing interference rejection algorithms. Second, we introduce the RF Challenge, a publicly available dataset featuring diverse RF signals along with code templates, which facilitates data-driven analysis of RF signal problems. Third, we propose novel AI-based rejection algorithms, specifically architectures like UNet and WaveNet, and evaluate their performance across eight different signal mixture types. These models demonstrate superior performance exceeding traditional methods like matched filtering and linear minimum mean square error estimation by up to two orders of magnitude in bit-error rate. Fourth, we summarize the results from an open competition hosted at 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2024) based on the RF Challenge, highlighting the significant potential for continued advancements in this area. Our findings underscore the promise of deep learning algorithms in mitigating interference, offering a strong foundation for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08839v1-abstract-full').style.display = 'none'; document.getElementById('2409.08839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 12 figures, submitted to the IEEE Open Journal of the Communications Society</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07088">arXiv:2409.07088</a> <span> [<a href="https://arxiv.org/pdf/2409.07088">pdf</a>, <a href="https://arxiv.org/format/2409.07088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ontology-Free General-Domain Knowledge Graph-to-Text Generation Dataset Synthesis using Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Daehee Kim</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+D">Deokhyung Kang</a>, <a href="/search/cs?searchtype=author&query=Ryu%2C+S">Sangwon Ryu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07088v1-abstract-short" style="display: inline;"> Knowledge Graph-to-Text (G2T) generation involves verbalizing structured knowledge graphs into natural language text. Recent advancements in Pretrained Language Models (PLMs) have improved G2T performance, but their effectiveness depends on datasets with precise graph-text alignment. 
However, the scarcity of high-quality, general-domain G2T generation datasets restricts progress in the general-dom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07088v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07088v1-abstract-full" style="display: none;"> Knowledge Graph-to-Text (G2T) generation involves verbalizing structured knowledge graphs into natural language text. Recent advancements in Pretrained Language Models (PLMs) have improved G2T performance, but their effectiveness depends on datasets with precise graph-text alignment. However, the scarcity of high-quality, general-domain G2T generation datasets restricts progress in the general-domain G2T generation research. To address this issue, we introduce Wikipedia Ontology-Free Graph-text dataset (WikiOFGraph), a new large-scale G2T dataset generated using a novel method that leverages Large Language Model (LLM) and Data-QuestEval. Our new dataset, which contains 5.85M general-domain graph-text pairs, offers high graph-text consistency without relying on external ontologies. Experimental results demonstrate that PLM fine-tuned on WikiOFGraph outperforms those trained on other datasets across various evaluation metrics. Our method proves to be a scalable and effective solution for generating high-quality G2T data, significantly advancing the field of G2T generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07088v1-abstract-full').style.display = 'none'; document.getElementById('2409.07088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06263">arXiv:2409.06263</a> <span> [<a href="https://arxiv.org/pdf/2409.06263">pdf</a>, <a href="https://arxiv.org/format/2409.06263">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Keyword-Aware ASR Error Augmentation for Robust Dialogue State Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jihyun Lee</a>, <a href="/search/cs?searchtype=author&query=Im%2C+S">Solee Im</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+W">Wonjun Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06263v1-abstract-short" style="display: inline;"> Dialogue State Tracking (DST) is a key part of task-oriented dialogue systems, identifying important information in conversations. However, its accuracy drops significantly in spoken dialogue environments due to named entity errors from Automatic Speech Recognition (ASR) systems. We introduce a simple yet effective data augmentation method that targets those entities to improve the robustness of D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06263v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06263v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06263v1-abstract-full" style="display: none;"> Dialogue State Tracking (DST) is a key part of task-oriented dialogue systems, identifying important information in conversations. However, its accuracy drops significantly in spoken dialogue environments due to named entity errors from Automatic Speech Recognition (ASR) systems. We introduce a simple yet effective data augmentation method that targets those entities to improve the robustness of DST model. Our novel method can control the placement of errors using keyword-highlighted prompts while introducing phonetically similar errors. As a result, our method generated sufficient error patterns on keywords, leading to improved accuracy in noised and low-accuracy ASR environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06263v1-abstract-full').style.display = 'none'; document.getElementById('2409.06263v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06243">arXiv:2409.06243</a> <span> [<a href="https://arxiv.org/pdf/2409.06243">pdf</a>, <a href="https://arxiv.org/format/2409.06243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Inference is All You Need: Self Example Retriever for Cross-domain Dialogue State Tracking with ChatGPT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jihyun Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06243v1-abstract-short" style="display: inline;"> Traditional dialogue state tracking approaches heavily rely on extensive training data and handcrafted features, limiting their scalability and adaptability to new domains. In this paper, we propose a novel method that leverages inference and in-context learning with ChatGPT for domain transfer in dialogue state tracking, without any parameter updates. By guiding ChatGPT's chain of thought, we ena… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06243v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06243v1-abstract-full" style="display: none;"> Traditional dialogue state tracking approaches heavily rely on extensive training data and handcrafted features, limiting their scalability and adaptability to new domains. In this paper, we propose a novel method that leverages inference and in-context learning with ChatGPT for domain transfer in dialogue state tracking, without any parameter updates. By guiding ChatGPT's chain of thought, we enable it to retrieve relevant examples and generalize knowledge to accurately infer dialogue states, solely through inference. Experimental results on the MultiWOZ dataset demonstrate competitive performance and promising generalization across domains. Our parameter-free approach offers a scalable and adaptable solution, opening new research directions in domain transfer learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06243v1-abstract-full').style.display = 'none'; document.getElementById('2409.06243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05277">arXiv:2409.05277</a> <span> [<a href="https://arxiv.org/pdf/2409.05277">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TPAMI.2021.3122444">10.1109/TPAMI.2021.3122444 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Disentangled Representations for Short-Term and Long-Term Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Eom%2C+C">Chanho Eom</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+W">Wonkyung Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Geon Lee</a>, <a href="/search/cs?searchtype=author&query=Ham%2C+B">Bumsub Ham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05277v1-abstract-short" style="display: inline;"> We address the problem of person re-identification (reID), that is, retrieving person images from a large dataset, given a query image of the person of interest. A key challenge is to learn person representations robust to intra-class variations, as different persons could have the same attribute, and persons' appearances look different, e.g., with viewpoint changes. Recent reID methods focus on l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05277v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05277v1-abstract-full" style="display: none;"> We address the problem of person re-identification (reID), that is, retrieving person images from a large dataset, given a query image of the person of interest. A key challenge is to learn person representations robust to intra-class variations, as different persons could have the same attribute, and persons' appearances look different, e.g., with viewpoint changes. Recent reID methods focus on learning person features discriminative only for a particular factor of variations (e.g., human pose), which also requires corresponding supervisory signals (e.g., pose annotations). To tackle this problem, we propose to factorize person images into identity-related and unrelated features. Identity-related features contain information useful for specifying a particular person (e.g., clothing), while identity-unrelated ones hold other factors (e.g., human pose). To this end, we propose a new generative adversarial network, dubbed identity shuffle GAN (IS-GAN). It disentangles identity-related and unrelated features from person images through an identity-shuffling technique that exploits identification labels alone without any auxiliary supervisory signals. 
We restrict the distribution of identity-unrelated features or encourage the identity-related and unrelated features to be uncorrelated, facilitating the disentanglement process. Experimental results validate the effectiveness of IS-GAN, showing state-of-the-art performance on standard reID benchmarks, including Market-1501, CUHK03, and DukeMTMC-reID. We further demonstrate the advantages of disentangling person representations on a long-term reID task, setting a new state of the art on a Celeb-reID dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05277v1-abstract-full').style.display = 'none'; document.getElementById('2409.05277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:1910.12003</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE, VOL. 44, NO. 12, DECEMBER 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00323">arXiv:2409.00323</a> <span> [<a href="https://arxiv.org/pdf/2409.00323">pdf</a>, <a href="https://arxiv.org/format/2409.00323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.13140/RG.2.2.25134.11847">10.13140/RG.2.2.25134.11847 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> From Prediction to Application: Language Model-based Code Knowledge Tracing with Domain Adaptive Pre-Training and Automatic Feedback System with Pedagogical Prompting for Comprehensive Programming Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+U">Unggi Lee</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+J">Jiyeong Bae</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+Y">Yeonji Jung</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minji Kang</a>, <a href="/search/cs?searchtype=author&query=Byun%2C+G">Gyuri Byun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yeonseo Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dohee Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sookbun Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaekwon Park</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+T">Taekyung Ahn</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gunho Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hyeoncheol Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00323v1-abstract-short" style="display: inline;"> Knowledge Tracing (KT) is a critical component in online learning, but traditional approaches face limitations in interpretability and cross-domain adaptability. This paper introduces Language Model-based Code Knowledge Tracing (CodeLKT), an innovative application of Language model-based Knowledge Tracing (LKT) to programming education. CodeLKT leverages pre-trained language models to process lear… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00323v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00323v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00323v1-abstract-full" style="display: none;"> Knowledge Tracing (KT) is a critical component in online learning, but traditional approaches face limitations in interpretability and cross-domain adaptability. This paper introduces Language Model-based Code Knowledge Tracing (CodeLKT), an innovative application of Language model-based Knowledge Tracing (LKT) to programming education. CodeLKT leverages pre-trained language models to process learning data, demonstrating superior performance over existing KT and Code KT models. We explore Domain Adaptive Pre-Training (DAPT) and Task Adaptive Pre-Training (TAPT), showing enhanced performance in the coding domain and investigating cross-domain transfer between mathematics and coding. Additionally, we present an theoretically-informed integrated system combining CodeLKT with large language models to generate personalized, in-depth feedback to support students' programming learning. This work advances the field of Code Knowledge Tracing by expanding the knowledge base with language model-based approach and offering practical implications for programming education through data-informed feedback. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00323v1-abstract-full').style.display = 'none'; document.getElementById('2409.00323v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13850">arXiv:2408.13850</a> <span> [<a href="https://arxiv.org/pdf/2408.13850">pdf</a>, <a href="https://arxiv.org/format/2408.13850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Condensed Sample-Guided Model Inversion for Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Binici%2C+K">Kuluhan Binici</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+S">Shivam Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Acar%2C+C">Cihan Acar</a>, <a href="/search/cs?searchtype=author&query=Pham%2C+N+T">Nam Trung Pham</a>, <a href="/search/cs?searchtype=author&query=Leman%2C+K">Karianto Leman</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+T">Tulika Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13850v1-abstract-short" style="display: inline;"> Knowledge distillation (KD) is a key element in neural network compression that allows knowledge transfer from a pre-trained teacher model to a more compact student model. KD relies on access to the training dataset, which may not always be fully available due to privacy concerns or logistical issues related to the size of the data. To address this, "data-free" KD methods use synthetic data, gener… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13850v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13850v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13850v1-abstract-full" style="display: none;"> Knowledge distillation (KD) is a key element in neural network compression that allows knowledge transfer from a pre-trained teacher model to a more compact student model. KD relies on access to the training dataset, which may not always be fully available due to privacy concerns or logistical issues related to the size of the data. To address this, "data-free" KD methods use synthetic data, generated through model inversion, to mimic the target data distribution. However, conventional model inversion methods are not designed to utilize supplementary information from the target dataset, and thus, cannot leverage it to improve performance, even when it is available. In this paper, we consider condensed samples, as a form of supplementary information, and introduce a method for using them to better approximate the target data distribution, thereby enhancing the KD performance. Our approach is versatile, evidenced by improvements of up to 11.4% in KD accuracy across various datasets and model inversion-based methods. 
Importantly, it remains effective even when using as few as one condensed sample per class, and can also enhance performance in few-shot scenarios where only limited real data samples are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13850v1-abstract-full').style.display = 'none'; document.getElementById('2408.13850v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06065">arXiv:2408.06065</a> <span> [<a href="https://arxiv.org/pdf/2408.06065">pdf</a>, <a href="https://arxiv.org/format/2408.06065">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> An Investigation Into Explainable Audio Hate Speech Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+J">Jinmyeong An</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+W">Wonjun Lee</a>, <a href="/search/cs?searchtype=author&query=Jeon%2C+Y">Yejin Jeon</a>, <a href="/search/cs?searchtype=author&query=Ok%2C+J">Jungseul Ok</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yunsu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06065v1-abstract-short" style="display: inline;"> Research on hate speech has predominantly revolved around detection and interpretation from textual inputs, leaving verbal content largely unexplored. While there has been limited exploration into hate speech detection within verbal acoustic speech inputs, the aspect of interpretability has been overlooked. Therefore, we introduce a new task of explainable audio hate speech detection. Specifically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06065v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06065v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06065v1-abstract-full" style="display: none;"> Research on hate speech has predominantly revolved around detection and interpretation from textual inputs, leaving verbal content largely unexplored. While there has been limited exploration into hate speech detection within verbal acoustic speech inputs, the aspect of interpretability has been overlooked. Therefore, we introduce a new task of explainable audio hate speech detection. Specifically, we aim to identify the precise time intervals, referred to as audio frame-level rationales, which serve as evidence for hate speech classification. 
Towards this end, we propose two different approaches: cascading and End-to-End (E2E). The cascading approach initially converts audio to transcripts, identifies hate speech within these transcripts, and subsequently locates the corresponding audio time frames. Conversely, the E2E approach processes audio utterances directly, which allows it to pinpoint hate speech within specific time frames. Additionally, due to the lack of explainable audio hate speech datasets that include audio frame-level rationales, we curated a synthetic audio dataset to train our models. We further validated these models on actual human speech utterances and found that the E2E approach outperforms the cascading method in terms of the audio frame Intersection over Union (IoU) metric. Furthermore, we observed that including frame-level rationales significantly enhances hate speech detection accuracy for the E2E approach. \textbf{Disclaimer} The reader may encounter content of an offensive or hateful nature. However, given the nature of the work, this cannot be avoided. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06065v1-abstract-full').style.display = 'none'; document.getElementById('2408.06065v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIGDIAL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06044">arXiv:2408.06044</a> <span> [<a href="https://arxiv.org/pdf/2408.06044">pdf</a>, <a href="https://arxiv.org/format/2408.06044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DiagESC: Dialogue Synthesis for Integrating Depression Diagnosis into Emotional Support Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seo%2C+S">Seungyeon Seo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06044v1-abstract-short" style="display: inline;"> Dialogue systems for mental health care aim to provide appropriate support to individuals experiencing mental distress. While extensive research has been conducted to deliver adequate emotional support, existing studies cannot identify individuals who require professional medical intervention and cannot offer suitable guidance. 
We introduce the Diagnostic Emotional Support Conversation task for an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06044v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06044v1-abstract-full" style="display: none;"> Dialogue systems for mental health care aim to provide appropriate support to individuals experiencing mental distress. While extensive research has been conducted to deliver adequate emotional support, existing studies cannot identify individuals who require professional medical intervention and cannot offer suitable guidance. We introduce the Diagnostic Emotional Support Conversation task for an advanced mental health management system. We develop the DESC dataset to assess depression symptoms while maintaining user experience by utilizing task-specific utterance generation prompts and a strict filtering algorithm. Evaluations by professional psychological counselors indicate that DESC has a superior ability to diagnose depression than existing data. Additionally, conversational quality evaluation reveals that DESC maintains fluent, consistent, and coherent dialogues. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06044v1-abstract-full').style.display = 'none'; document.getElementById('2408.06044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGDIAL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06043">arXiv:2408.06043</a> <span> [<a href="https://arxiv.org/pdf/2408.06043">pdf</a>, <a href="https://arxiv.org/format/2408.06043">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+W">Wonjun Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">San Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+G">Gary Geunbae Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06043v1-abstract-short" style="display: inline;"> Recent dialogue systems rely on turn-based spoken interactions, requiring accurate Automatic Speech Recognition (ASR). Errors in ASR can significantly impact downstream dialogue tasks. 
arXiv:2408.06043 [pdf, other] cs.CL cs.SD eess.AS
Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning
Authors: Wonjun Lee, San Kim, Gary Geunbae Lee
Abstract: Recent dialogue systems rely on turn-based spoken interactions, requiring accurate Automatic Speech Recognition (ASR). Errors in ASR can significantly impact downstream dialogue tasks. To address this, using dialogue context from user and agent interactions for transcribing subsequent utterances has been proposed. This method incorporates the transcription of the user's speech and the agent's response as model input, using the accumulated context generated by each turn. However, this context is susceptible to ASR errors because it is generated by the ASR model in an auto-regressive fashion. Such noisy context can further degrade the benefits of context input, resulting in suboptimal ASR performance. In this paper, we introduce Context Noise Representation Learning (CNRL) to enhance robustness against noisy context, ultimately improving dialogue speech recognition accuracy. To maximize the advantage of context awareness, our approach includes decoder pre-training using text-based dialogue data and noise representation learning for a context encoder. Based on the evaluation of speech dialogues, our method shows superior results compared to baselines. Furthermore, the strength of our approach is highlighted in noisy environments where user speech is barely audible due to real-world noise, relying on contextual information to transcribe the input accurately.
Submitted 12 August, 2024; originally announced August 2024.
Comments: 11 pages, 2 figures, Accepted to SIGDIAL 2024
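The abstract describes conditioning transcription of the next turn on the accumulated user/agent transcript, and notes that this context itself arrives with ASR errors. The sketch below is a rough illustration of that setup, not the paper's CNRL architecture: it assembles a dialogue context string and corrupts it with a crude character-level noise model to mimic ASR errors; the `asr_model.transcribe` call is a hypothetical placeholder.

```python
import random

def corrupt(text, sub_rate=0.1, vocab="abcdefghijklmnopqrstuvwxyz "):
    """Crude stand-in for ASR noise: randomly substitute characters.

    The actual noise model used for CNRL training is not specified here;
    this only illustrates the clean vs. noisy context distinction.
    """
    return "".join(random.choice(vocab) if random.random() < sub_rate else c
                   for c in text)

def build_context(turns, max_turns=3):
    """Concatenate the last few user/agent turns into one context string."""
    return " | ".join(f"{speaker}: {utt}" for speaker, utt in turns[-max_turns:])

# Hypothetical usage: condition an ASR model on (possibly noisy) dialogue context.
turns = [("agent", "which city are you flying to"), ("user", "i want to fly to boston")]
clean_ctx = build_context(turns)
noisy_ctx = corrupt(clean_ctx)   # what the model actually sees at inference time
# transcript = asr_model.transcribe(audio, context=noisy_ctx)  # placeholder call
```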
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 2 figures, Accepted to SIGDIAL2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05927">arXiv:2408.05927</a> <span> [<a href="https://arxiv.org/pdf/2408.05927">pdf</a>, <a href="https://arxiv.org/format/2408.05927">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple Early Exiting Framework for Accelerated Sampling in Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Moon%2C+T">Taehong Moon</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+M">Moonseok Choi</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+E">EungGu Yun</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+J">Jongmin Yoon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gayoung Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Jaewoong Cho</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Juho Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05927v1-abstract-short" style="display: inline;"> Diffusion models have shown remarkable performance in generation problems over various domains including images, videos, text, and audio. A practical bottleneck of diffusion models is their sampling speed, due to the repeated evaluation of score estimation networks during the inference. In this work, we propose a novel framework capable of adaptively allocating compute required for the score estim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05927v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05927v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05927v1-abstract-full" style="display: none;"> Diffusion models have shown remarkable performance in generation problems over various domains including images, videos, text, and audio. A practical bottleneck of diffusion models is their sampling speed, due to the repeated evaluation of score estimation networks during the inference. In this work, we propose a novel framework capable of adaptively allocating compute required for the score estimation, thereby reducing the overall sampling time of diffusion models. We observe that the amount of computation required for the score estimation may vary along the time step for which the score is estimated. Based on this observation, we propose an early-exiting scheme, where we skip the subset of parameters in the score estimation network during the inference, based on a time-dependent exit schedule. Using the diffusion models for image synthesis, we show that our method could significantly improve the sampling throughput of the diffusion models without compromising image quality. Furthermore, we also demonstrate that our method seamlessly integrates with various types of solvers for faster sampling, capitalizing on their compatibility to enhance overall efficiency. 
arXiv:2408.05074 [pdf] cs.CL cs.AI
RT-Surv: Improving Mortality Prediction After Radiotherapy with Large Language Model Structuring of Large-Scale Unstructured Electronic Health Records
Authors: Sangjoon Park, Chan Woo Wee, Seo Hee Choi, Kyung Hwan Kim, Jee Suk Chang, Hong In Yoon, Ik Jae Lee, Yong Bae Kim, Jaeho Cho, Ki Chang Keum, Chang Geol Lee, Hwa Kyung Byun, Woong Sub Koom
Abstract: Accurate patient selection is critical in radiotherapy (RT) to prevent ineffective treatments.
Traditional survival prediction models, relying on structured data, often lack precision. This study explores the potential of large language models (LLMs) to structure unstructured electronic health record (EHR) data, thereby improving survival prediction accuracy through comprehensive clinical information integration. Data from 34,276 patients treated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed, encompassing both structured and unstructured data. An open-source LLM was used to structure the unstructured EHR data via single-shot learning, with its performance compared against a domain-specific medical LLM and a smaller variant. Survival prediction models were developed using statistical, machine learning, and deep learning approaches, incorporating both structured and LLM-structured data. Clinical experts evaluated the accuracy of the LLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring unstructured EHR data without additional training, significantly outperforming the domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs were more effective, particularly in extracting clinically relevant features like general condition and disease extent, which closely correlated with patient survival. Incorporating LLM-structured clinical features into survival prediction models significantly improved accuracy, with the C-index of deep learning models increasing from 0.737 to 0.820. These models also became more interpretable by emphasizing clinically significant factors. This study shows that general-domain LLMs, even without specific medical training, can effectively structure large-scale unstructured EHR data, substantially enhancing the accuracy and interpretability of clinical predictive models.
Submitted 13 September, 2024; v1 submitted 9 August, 2024; originally announced August 2024.
Comments: 23 pages, 2 tables, 4 figures
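The reported gain (C-index rising from 0.737 to 0.820) refers to the concordance index, which measures how often a survival model ranks pairs of patients in the correct order. A minimal sketch of Harrell's C-index on fully observed data is shown below for intuition; the study's actual evaluation presumably handles censored follow-up, which this toy version ignores, and the example numbers are invented.

```python
from itertools import combinations

def c_index(times, risks):
    """Concordance index on fully observed survival times (no censoring).

    times: observed survival times; risks: model risk scores (higher = worse).
    A pair is concordant when the patient who died earlier has the higher risk.
    """
    concordant, ties, comparable = 0, 0, 0
    for i, j in combinations(range(len(times)), 2):
        if times[i] == times[j]:
            continue                      # skipped in this simplified version
        comparable += 1
        early, late = (i, j) if times[i] < times[j] else (j, i)
        if risks[early] > risks[late]:
            concordant += 1
        elif risks[early] == risks[late]:
            ties += 1
    return (concordant + 0.5 * ties) / comparable

# Example with made-up numbers (not from the study):
print(c_index(times=[5, 12, 9, 30], risks=[0.9, 0.4, 0.6, 0.1]))  # 1.0 (perfect ranking)
```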
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 2 tables, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a> <span> [<a href="https://arxiv.org/pdf/2407.21783">pdf</a>, <a href="https://arxiv.org/format/2407.21783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Llama 3 Herd of Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grattafiori%2C+A">Aaron Grattafiori</a>, <a href="/search/cs?searchtype=author&query=Dubey%2C+A">Abhimanyu Dubey</a>, <a href="/search/cs?searchtype=author&query=Jauhri%2C+A">Abhinav Jauhri</a>, <a href="/search/cs?searchtype=author&query=Pandey%2C+A">Abhinav Pandey</a>, <a href="/search/cs?searchtype=author&query=Kadian%2C+A">Abhishek Kadian</a>, <a href="/search/cs?searchtype=author&query=Al-Dahle%2C+A">Ahmad Al-Dahle</a>, <a href="/search/cs?searchtype=author&query=Letman%2C+A">Aiesha Letman</a>, <a href="/search/cs?searchtype=author&query=Mathur%2C+A">Akhil Mathur</a>, <a href="/search/cs?searchtype=author&query=Schelten%2C+A">Alan Schelten</a>, <a href="/search/cs?searchtype=author&query=Vaughan%2C+A">Alex Vaughan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">Amy Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&query=Hartshorn%2C+A">Anthony Hartshorn</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">Aobo Yang</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+A">Archi Mitra</a>, <a href="/search/cs?searchtype=author&query=Sravankumar%2C+A">Archie Sravankumar</a>, <a href="/search/cs?searchtype=author&query=Korenev%2C+A">Artem Korenev</a>, <a href="/search/cs?searchtype=author&query=Hinsvark%2C+A">Arthur Hinsvark</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+A">Arun Rao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aston Zhang</a>, <a href="/search/cs?searchtype=author&query=Rodriguez%2C+A">Aurelien Rodriguez</a>, <a href="/search/cs?searchtype=author&query=Gregerson%2C+A">Austen Gregerson</a>, <a href="/search/cs?searchtype=author&query=Spataru%2C+A">Ava Spataru</a>, <a href="/search/cs?searchtype=author&query=Roziere%2C+B">Baptiste Roziere</a> , et al. (536 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21783v3-abstract-short" style="display: inline;"> Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. 
This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe this approach performs competitively with the state-of-the-art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development.
Submitted 23 November, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21035">arXiv:2407.21035</a> <span> [<a href="https://arxiv.org/pdf/2407.21035">pdf</a>, <a href="https://arxiv.org/format/2407.21035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Direct Unlearning Optimization for Robust and Safe Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+Y">Yong-Hyun Park</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+S">Sangdoo Yun</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jin-Hwa Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junho Kim</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+G">Geonhui Jang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+Y">Yonghyun Jeong</a>, <a href="/search/cs?searchtype=author&query=Jo%2C+J">Junghyo Jo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gayoung Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21035v1-abstract-short" style="display: inline;"> Recent advancements in text-to-image (T2I) models have greatly benefited from large-scale datasets, but they also pose significant risks due to the potential generation of unsafe content. To mitigate this issue, researchers have developed unlearning techniques to remove the model's ability to generate potentially harmful content. However, these methods are easily bypassed by adversarial attacks, m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21035v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21035v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21035v1-abstract-full" style="display: none;"> Recent advancements in text-to-image (T2I) models have greatly benefited from large-scale datasets, but they also pose significant risks due to the potential generation of unsafe content. To mitigate this issue, researchers have developed unlearning techniques to remove the model's ability to generate potentially harmful content. However, these methods are easily bypassed by adversarial attacks, making them unreliable for ensuring the safety of generated images. In this paper, we propose Direct Unlearning Optimization (DUO), a novel framework for removing Not Safe For Work (NSFW) content from T2I models while preserving their performance on unrelated topics. DUO employs a preference optimization approach using curated paired image data, ensuring that the model learns to remove unsafe visual concepts while retaining unrelated features. Furthermore, we introduce an output-preserving regularization term to maintain the model's generative capabilities on safe content. Extensive experiments demonstrate that DUO can robustly defend against various state-of-the-art red teaming methods without significant performance degradation on unrelated topics, as measured by FID and CLIP scores. 
arXiv:2407.18328 [pdf, ps, other] cs.CL cs.CY
Unveiling Scoring Processes: Dissecting the Differences between LLMs and Human Graders in Automatic Scoring
Authors: Xuansheng Wu, Padmaja Pravin Saraf, Gyeong-Geon Lee, Ehsan Latif, Ninghao Liu, Xiaoming Zhai
Abstract: Large language models (LLMs) have demonstrated strong potential in performing automatic scoring for constructed response assessments. While constructed responses graded by humans are usually based on given grading rubrics, the methods by which LLMs assign scores remain largely unclear. It is also uncertain how closely AI's scoring process mirrors that of humans, or if it adheres to the same grading criteria. To address this gap, this paper uncovers the grading rubrics that LLMs use to score students' written responses to science tasks and their alignment with human scores. We also examine whether enhancing the alignment can improve scoring accuracy. Specifically, we prompt LLMs to generate analytic rubrics that they use to assign scores and study the alignment gap with human grading rubrics. Based on a series of experiments with various configurations of LLM settings, we reveal a notable alignment gap between human and LLM graders. While LLMs can adapt quickly to scoring tasks, they often resort to shortcuts, bypassing deeper logical reasoning expected in human grading. We found that incorporating high-quality analytical rubrics designed to reflect human grading logic can mitigate this gap and enhance LLMs' scoring accuracy. These results caution against the simplistic application of LLMs in science education and highlight the importance of aligning LLM outputs with human expectations to ensure efficient and accurate automatic scoring.
Submitted 4 July, 2024; originally announced July 2024.
Comments: Non-archival presentation at the EDM 2024 Workshop on Large Language Models
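One step described above is prompting an LLM to produce an analytic rubric and then score responses with it. A hypothetical two-stage prompt construction is sketched below; the prompt wording, scoring scale, and the `llm` callable are invented for illustration and do not come from the paper.

```python
def rubric_prompt(task, max_score=4):
    """Stage 1 (hypothetical): ask the LLM to write an analytic rubric."""
    return (f"You are grading student responses to this science task:\n{task}\n"
            f"Write an analytic rubric with criteria for scores 0-{max_score}.")

def scoring_prompt(task, rubric, response):
    """Stage 2 (hypothetical): score a response using the generated rubric."""
    return (f"Task: {task}\nRubric:\n{rubric}\n"
            f"Student response: {response}\n"
            "Return only the integer score.")

# Usage sketch (any chat-completion client could stand in for `llm`):
# rubric = llm(rubric_prompt(task))
# score  = int(llm(scoring_prompt(task, rubric, student_answer)))
```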
arXiv:2407.15648 [pdf, other] cs.CV
TreeSBA: Tree-Transformer for Self-Supervised Sequential Brick Assembly
Authors: Mengqi Guo, Chen Li, Yuyang Zhao, Gim Hee Lee
Abstract: Inferring step-wise actions to assemble 3D objects with primitive bricks from images is a challenging task due to complex constraints and the vast number of possible combinations. Recent studies have demonstrated promising results on sequential LEGO brick assembly through the utilization of LEGO-Graph modeling to predict sequential actions.
However, existing approaches are class-specific and require significant computational and 3D annotation resources. In this work, we first propose a computationally efficient breadth-first search (BFS) LEGO-Tree structure to model the sequential assembly actions by considering connections between consecutive layers. Based on the LEGO-Tree structure, we then design a class-agnostic tree-transformer framework to predict the sequential assembly actions from the input multi-view images. A major challenge of the sequential brick assembly task is that the step-wise action labels are costly and tedious to obtain in practice. We mitigate this problem by leveraging synthetic-to-real transfer learning. Specifically, our model is first pre-trained on synthetic data with full supervision from the available action labels. We then circumvent the requirement for action labels in the real data by proposing an action-to-silhouette projection that replaces action labels with input image silhouettes for self-supervision. Without any annotation on the real data, our model outperforms existing methods with 3D supervision by 7.8% and 11.3% in mIoU on the MNIST and ModelNet Construction datasets, respectively.
Submitted 22 July, 2024; originally announced July 2024.
Comments: ECCV 2024
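The LEGO-Tree idea above orders bricks by layer and expands connections between consecutive layers breadth-first. A generic BFS ordering over a layer-connection graph is sketched below; the brick identifiers and the parent-to-children encoding are assumptions for illustration, not the paper's data structure.

```python
from collections import deque

def bfs_assembly_order(connections, root):
    """Breadth-first ordering of bricks given parent->children connections
    between consecutive layers. `connections` maps a brick id to the brick
    ids it supports in the next layer (a hypothetical encoding).
    """
    order, seen, queue = [], {root}, deque([root])
    while queue:
        brick = queue.popleft()
        order.append(brick)
        for child in connections.get(brick, []):
            if child not in seen:
                seen.add(child)
                queue.append(child)
    return order

# Toy structure: base brick 0 supports bricks 1 and 2; brick 1 supports 3.
print(bfs_assembly_order({0: [1, 2], 1: [3]}, root=0))  # [0, 1, 2, 3]
```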
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13808">arXiv:2407.13808</a> <span> [<a href="https://arxiv.org/pdf/2407.13808">pdf</a>, <a href="https://arxiv.org/format/2407.13808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoAPT: Context Attribute words for Prompt Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gun Lee</a>, <a href="/search/cs?searchtype=author&query=An%2C+S">Subin An</a>, <a href="/search/cs?searchtype=author&query=Baik%2C+S">Sungyong Baik</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Soochahn Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13808v1-abstract-short" style="display: inline;"> We propose a novel prompt tuning method called CoAPT(Context Attribute words in Prompt Tuning) for few/zero-shot image classification. The core motivation is that attributes are descriptive words with rich information about a given concept. Thus, we aim to enrich text queries of existing prompt tuning methods, improving alignment between text and image embeddings in CLIP embedding space. To do so,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13808v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13808v1-abstract-full" style="display: none;"> We propose a novel prompt tuning method called CoAPT(Context Attribute words in Prompt Tuning) for few/zero-shot image classification. The core motivation is that attributes are descriptive words with rich information about a given concept. Thus, we aim to enrich text queries of existing prompt tuning methods, improving alignment between text and image embeddings in CLIP embedding space. To do so, CoAPT integrates attribute words as additional prompts within learnable prompt tuning and can be easily incorporated into various existing prompt tuning methods. To facilitate the incorporation of attributes into text embeddings and the alignment with image embeddings, soft prompts are trained together with an additional meta-network that generates input-image-wise feature biases from the concatenated feature encodings of the image-text combined queries. Our experiments demonstrate that CoAPT leads to considerable improvements for existing baseline methods on several few/zero-shot image classification tasks, including base-to-novel generalization, cross-dataset transfer, and domain generalization. Our findings highlight the importance of combining hard and soft prompts and pave the way for future research on the interplay between text and image latent spaces in pre-trained models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13808v1-abstract-full').style.display = 'none'; document.getElementById('2407.13808v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12508">arXiv:2407.12508</a> <span> [<a href="https://arxiv.org/pdf/2407.12508">pdf</a>, <a href="https://arxiv.org/format/2407.12508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MERLIN: Multimodal Embedding Refinement via LLM-based Iterative Navigation for Text-Video Retrieval-Rerank Pipeline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+D">Donghoon Han</a>, <a href="/search/cs?searchtype=author&query=Park%2C+E">Eunhwan Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gisang Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+A">Adam Lee</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+N">Nojun Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12508v2-abstract-short" style="display: inline;"> The rapid expansion of multimedia content has made accurately retrieving relevant videos from large collections increasingly challenging. Recent advancements in text-video retrieval have focused on cross-modal interactions, large-scale foundation model training, and probabilistic modeling, yet often neglect the crucial user perspective, leading to discrepancies between user queries and the content… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12508v2-abstract-full').style.display = 'inline'; document.getElementById('2407.12508v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12508v2-abstract-full" style="display: none;"> The rapid expansion of multimedia content has made accurately retrieving relevant videos from large collections increasingly challenging. Recent advancements in text-video retrieval have focused on cross-modal interactions, large-scale foundation model training, and probabilistic modeling, yet often neglect the crucial user perspective, leading to discrepancies between user queries and the content retrieved. To address this, we introduce MERLIN (Multimodal Embedding Refinement via LLM-based Iterative Navigation), a novel, training-free pipeline that leverages Large Language Models (LLMs) for iterative feedback learning. 
MERLIN refines query embeddings from a user perspective, enhancing alignment between queries and video content through a dynamic question-answering process. Experimental results on datasets like MSR-VTT, MSVD, and ActivityNet demonstrate that MERLIN substantially improves Recall@1, outperforming existing systems and confirming the benefits of integrating LLMs into multimodal retrieval systems for more responsive and context-aware multimedia retrieval.
Submitted 16 October, 2024; v1 submitted 17 July, 2024; originally announced July 2024.
Comments: EMNLP 2024 Industry Track Accepted (Camera-Ready Version)
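The pipeline above is training-free: an LLM provides question-answering feedback about the current top-ranked videos, the feedback is folded back into the query embedding, and the candidates are re-ranked. A schematic loop in that spirit is sketched below; the moving-average update rule and the `ask_llm` / `embed_answer` callables are placeholders rather than MERLIN's actual components.

```python
import numpy as np

def rerank_loop(query_emb, video_embs, ask_llm, embed_answer,
                rounds=3, alpha=0.5, top_k=5):
    """Illustrative retrieve -> question -> refine -> rerank loop.

    video_embs is assumed to be L2-normalized, shape (num_videos, dim).
    ask_llm(query, candidate_ids) and embed_answer(text) are hypothetical
    callables standing in for LLM-based question answering and a text encoder.
    """
    q = query_emb / np.linalg.norm(query_emb)
    for _ in range(rounds):
        scores = video_embs @ q                       # cosine similarity scores
        candidates = np.argsort(-scores)[:top_k]
        answer_text = ask_llm(q, candidates)          # feedback about the candidates
        a = embed_answer(answer_text)
        q = (1 - alpha) * q + alpha * a / np.linalg.norm(a)   # fold feedback into query
        q /= np.linalg.norm(q)
    return np.argsort(-(video_embs @ q))              # final ranking
```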