Search | arXiv e-print repository
Showing 1–50 of 269 results for author: Lim, J

Searching in archive cs.
1. arXiv:2502.08640 [pdf, other]  cs.LG cs.AI cs.CL cs.CV cs.CY
   Utility Engineering: Analyzing and Controlling Emergent Value Systems in AIs
   Authors: Mantas Mazeika, Xuwang Yin, Rishub Tamirisa, Jaehyuk Lim, Bruce W. Lee, Richard Ren, Long Phan, Norman Mu, Adam Khoja, Oliver Zhang, Dan Hendrycks
   Abstract: As AIs rapidly advance and become more agentic, the risk they pose is governed not only by their capabilities but increasingly by their propensities, including goals and values. Tracking the emergence of goals and values has proven a longstanding problem, and despite much interest over the years it remains unclear whether current AIs have meaningful values. We propose a solution to this problem, leveraging the framework of utility functions to study the internal coherence of AI preferences. Surprisingly, we find that independently-sampled preferences in current LLMs exhibit high degrees of structural coherence, and moreover that this emerges with scale. These findings suggest that value systems emerge in LLMs in a meaningful sense, a finding with broad implications. To study these emergent value systems, we propose utility engineering as a research agenda, comprising both the analysis and control of AI utilities. We uncover problematic and often shocking values in LLM assistants despite existing control measures. These include cases where AIs value themselves over humans and are anti-aligned with specific individuals. To constrain these emergent value systems, we propose methods of utility control. As a case study, we show how aligning utilities with a citizen assembly reduces political biases and generalizes to new scenarios. Whether we like it or not, value systems have already emerged in AIs, and much work remains to fully understand and control these emergent representations.
   Submitted 12 February, 2025; originally announced February 2025.
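The abstract above studies the internal coherence of AI preferences through fitted utility functions. The sketch below is a rough illustration rather than the paper's actual model or fitting procedure: it fits Bradley-Terry-style utilities to toy pairwise preferences and scores how many held-out preferences the fitted utilities explain. The function names and data are hypothetical.

```python
# Hedged sketch, not the paper's method: fit Bradley-Terry-style utilities to
# pairwise preferences and measure how coherently they explain held-out choices.
import numpy as np

def fit_utilities(pairs, n_items, lr=0.1, steps=2000):
    """pairs: list of (i, j) meaning outcome i was preferred over outcome j."""
    u = np.zeros(n_items)
    wins = np.array(pairs)
    for _ in range(steps):
        diff = u[wins[:, 0]] - u[wins[:, 1]]
        p = 1.0 / (1.0 + np.exp(-diff))          # model prob. of each observed choice
        grad = np.zeros(n_items)
        np.add.at(grad, wins[:, 0], 1.0 - p)     # push preferred outcomes up
        np.add.at(grad, wins[:, 1], -(1.0 - p))  # push dispreferred outcomes down
        u += lr * grad / len(wins)
    return u - u.mean()                          # utilities identified up to a constant

def coherence(pairs, u):
    """Fraction of preferences consistent with the fitted utilities."""
    wins = np.array(pairs)
    return float(np.mean(u[wins[:, 0]] > u[wins[:, 1]]))

# Toy usage with three outcomes and transitive sampled preferences.
train = [(0, 1), (1, 2), (0, 2), (0, 1), (1, 2)]
u = fit_utilities(train, n_items=3)
print(coherence([(0, 2), (1, 2)], u))  # 1.0 on this transitive toy example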
2. arXiv:2502.07794 [pdf]  cs.CY cs.AI
   Regulatory Science Innovation for Generative AI and Large Language Models in Health and Medicine: A Global Call for Action
   Authors: Jasmine Chiat Ling Ong, Yilin Ning, Mingxuan Liu, Yian Ma, Zhao Liang, Kuldev Singh, Robert T Chang, Silke Vogel, John CW Lim, Iris Siu Kwan Tan, Oscar Freyer, Stephen Gilbert, Danielle S Bitterman, Xiaoxuan Liu, Alastair K Denniston, Nan Liu
   Abstract: The integration of generative AI (GenAI) and large language models (LLMs) in healthcare presents both unprecedented opportunities and challenges, necessitating innovative regulatory approaches. GenAI and LLMs offer broad applications, from automating clinical workflows to personalizing diagnostics. However, the non-deterministic outputs, broad functionalities and complex integration of GenAI and LLMs challenge existing medical device regulatory frameworks, including the total product life cycle (TPLC) approach. Here we discuss the constraints of the TPLC approach to GenAI and LLM-based medical device regulation, and advocate for global collaboration in regulatory science research. This serves as the foundation for developing innovative approaches including adaptive policies and regulatory sandboxes, to test and refine governance in real-world settings. International harmonization, as seen with the International Medical Device Regulators Forum, is essential to manage implications of LLM on global health, including risks of widening health inequities driven by inherent model biases. By engaging multidisciplinary expertise, prioritizing iterative, data-driven approaches, and focusing on the needs of diverse populations, global regulatory science research enables the responsible and equitable advancement of LLM innovations in healthcare.
   Submitted 27 January, 2025; originally announced February 2025.

3. arXiv:2501.16973 [pdf, other]  cs.RO
   Towards Open-Source and Modular Space Systems with ATMOS
   Authors: Pedro Roque, Sujet Phodapol, Elias Krantz, Jaeyoung Lim, Joris Verhagen, Frank Jiang, David Dorner, Roland Siegwart, Ivan Stenius, Gunnar Tibert, Huina Mao, Jana Tumova, Christer Fuglesang, Dimos V. Dimarogonas
   Abstract: In the near future, autonomous space systems will compose a large number of the spacecraft being deployed. Their tasks will involve autonomous rendezvous and proximity operations with large structures, such as inspections or assembly of orbiting space stations and maintenance and human-assistance tasks over shared workspaces. To promote replicable and reliable scientific results for autonomous control of spacecraft, we present the design of a space systems laboratory based on open-source and modular software and hardware. The simulation software provides a software-in-the-loop (SITL) architecture that seamlessly transfers simulated results to the ATMOS platforms, developed for testing of multi-agent autonomy schemes for microgravity. The manuscript presents the KTH space systems laboratory facilities and the ATMOS platform as open-source hardware and software contributions. Preliminary results showcase SITL and real testing.
   Submitted 28 January, 2025; originally announced January 2025.
   Comments: Preliminary release, to be submitted

4. arXiv:2501.15839 [pdf, other]  cs.CV
   Controllable Hand Grasp Generation for HOI and Efficient Evaluation Methods
   Authors: Ishant, Rongliang Wu, Joo Hwee Lim
   Abstract: Controllable affordance Hand-Object Interaction (HOI) generation has become an increasingly important area of research in computer vision. In HOI generation, the hand grasp generation is a crucial step for effectively controlling the geometry of the hand. Current hand grasp generation methods rely on 3D information for both the hand and the object. In addition, these methods lack controllability concerning the hand's location and orientation. We treat the hand pose as the discrete graph structure and exploit the geometric priors. It is well established that higher order contextual dependency among the points improves the quality of the results in general. We propose a framework of higher order geometric representations (HOR's) inspired by spectral graph theory and vector algebra to improve the quality of generated hand poses. We demonstrate the effectiveness of our proposed HOR's in devising a controllable novel diffusion method (based on 2D information) for hand grasp generation that outperforms the state of the art (SOTA). Overcoming the limitations of existing methods: like lacking of controllability and dependency on 3D information. Once we have the generated pose, it is very natural to evaluate them using a metric. Popular metrics like FID and MMD are biased and inefficient for evaluating the generated hand poses. Using our proposed HOR's, we introduce an efficient and stable framework of evaluation metrics for grasp generation methods, addressing inefficiencies and biases in FID and MMD.
   Submitted 27 January, 2025; originally announced January 2025.

5. arXiv:2501.09672 [pdf, other]  cs.CV cs.AI
   Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP Evaluation Benchmark
   Authors: Alexis Roger, Prateek Humane, Daniel Z. Kaplan, Kshitij Gupta, Qi Sun, George Adamopoulos, Jonathan Siu Chi Lim, Quentin Anthony, Edwin Fennell, Irina Rish
   Abstract: The proliferation of Vision-Language Models (VLMs) in the past several years calls for rigorous and comprehensive evaluation methods and benchmarks. This work analyzes existing VLM evaluation techniques, including automated metrics, AI-based assessments, and human evaluations across diverse tasks. We first introduce Robin - a novel suite of VLMs that we built by combining Large Language Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use Robin to identify shortcomings of current evaluation approaches across scales. Next, to overcome the identified limitations, we introduce CHIRP - a new long form response benchmark we developed for more robust and complete VLM evaluation. We provide open access to the Robin training code, model suite, and CHIRP benchmark to promote reproducibility and advance VLM research.
   Submitted 20 January, 2025; v1 submitted 16 January, 2025; originally announced January 2025.

6. arXiv:2501.06246 [pdf, other]  cs.CL cs.AI cs.DS
   A partition cover approach to tokenization
   Authors: Jia Peng Lim, Davin Choo, Hady W. Lauw
   Abstract: Tokenization is the process of encoding strings into tokens from a fixed vocabulary of size $k$ and is widely utilized in Natural Language Processing applications. The leading tokenization algorithm today is Byte Pair Encoding (BPE), which formulates the tokenization problem as a compression problem and tackles it by performing sequences of merges. In this work, we formulate tokenization as an optimization objective, show that it is NP-hard via a simple reduction from vertex cover, and propose a polynomial-time greedy algorithm GreedTok. Our formulation naturally relaxes to the well-studied weighted maximum coverage problem which has a simple $(1 - 1/e)$-approximation algorithm GreedWMC. Through empirical evaluations on real-world corpora, we show that GreedTok outperforms BPE, while achieving a comparable objective score as GreedWMC (which could have achieved a higher score due to relaxation).
   Submitted 8 January, 2025; originally announced January 2025.
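The abstract above relates the tokenization objective to the weighted maximum coverage problem and its classic greedy $(1 - 1/e)$-approximation (GreedWMC). The sketch below shows that standard greedy selection on a toy weighted-coverage instance; it is not the paper's GreedTok implementation, and the candidate sets, weights, and budget are made up for illustration.

```python
# Minimal sketch of the standard greedy (1 - 1/e)-approximation for weighted
# maximum coverage, the relaxation mentioned in the abstract. Not the paper's
# GreedTok algorithm; it only illustrates the underlying greedy idea.
def greedy_weighted_max_coverage(candidate_sets, weights, k):
    """Pick k candidates greedily, each time taking the one whose still-uncovered
    elements have the largest total weight.

    candidate_sets: dict mapping a candidate name to a set of element ids.
    weights: dict mapping element id to a non-negative weight (e.g. corpus counts).
    k: number of candidates to select (e.g. a vocabulary budget).
    """
    covered, chosen = set(), []
    for _ in range(k):
        best, best_gain = None, 0.0
        for name, elems in candidate_sets.items():
            if name in chosen:
                continue
            gain = sum(weights[e] for e in elems - covered)
            if gain > best_gain:
                best, best_gain = name, gain
        if best is None:        # nothing left adds weight
            break
        chosen.append(best)
        covered |= candidate_sets[best]
    return chosen

# Toy usage: candidate "tokens" covering weighted positions in a corpus.
sets = {"ing": {1, 2, 3}, "tion": {4, 5}, "re": {1, 4}}
w = {1: 10, 2: 3, 3: 3, 4: 8, 5: 8}
print(greedy_weighted_max_coverage(sets, w, k=2))  # -> ['re', 'tion'] on this toy instance
```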
7. arXiv:2501.00210 [pdf, other]  cs.DC cs.AI cs.AR
   Debunking the CUDA Myth Towards GPU-based AI Systems
   Authors: Yunjae Lee, Juntaek Lim, Jehyeon Bang, Eunyeong Cho, Huijong Jeong, Taesu Kim, Hyungjun Kim, Joonhyung Lee, Jinseop Im, Ranggi Hwang, Se Jung Kwon, Dongsoo Lee, Minsoo Rhu
   Abstract: With the rise of AI, NVIDIA GPUs have become the de facto standard for AI system design. This paper presents a comprehensive evaluation of Intel Gaudi NPUs as an alternative to NVIDIA GPUs for AI model serving. First, we create a suite of microbenchmarks to compare Intel Gaudi-2 with NVIDIA A100, showing that Gaudi-2 achieves competitive performance not only in primitive AI compute, memory, and communication operations but also in executing several important AI workloads end-to-end. We then assess Gaudi NPU's programmability by discussing several software-level optimization strategies to employ for implementing critical FBGEMM operators and vLLM, evaluating their efficiency against GPU-optimized counterparts. Results indicate that Gaudi-2 achieves energy efficiency comparable to A100, though there are notable areas for improvement in terms of software maturity. Overall, we conclude that, with effective integration into high-level AI frameworks, Gaudi NPUs could challenge NVIDIA GPU's dominance in the AI server market, though further improvements are necessary to fully compete with NVIDIA's robust software ecosystem.
   Submitted 30 December, 2024; originally announced January 2025.
   Comments: Under Review

8. arXiv:2412.11452 [pdf, other]  cs.CV cs.LG eess.IV
   Multilabel Classification for Lung Disease Detection: Integrating Deep Learning and Natural Language Processing
   Authors: Maria Efimovich, Jayden Lim, Vedant Mehta, Ethan Poon
   Abstract: Classifying chest radiographs is a time-consuming and challenging task, even for experienced radiologists. This provides an area for improvement due to the difficulty in precisely distinguishing between conditions such as pleural effusion, pneumothorax, and pneumonia. We propose a novel transfer learning model for multi-label lung disease classification, utilizing the CheXpert dataset with over 12,617 images of frontal radiographs being analyzed. By integrating RadGraph parsing for efficient annotation extraction, we enhance the model's ability to accurately classify multiple lung diseases from complex medical images. The proposed model achieved an F1 score of 0.69 and an AUROC of 0.86, demonstrating its potential for clinical applications. Also explored was the use of Natural Language Processing (NLP) to parse report metadata and address uncertainties in disease classification. By comparing uncertain reports with more certain cases, the NLP-enhanced model improves its ability to conclusively classify conditions. This research highlights the connection between deep learning and NLP, underscoring their potential to enhance radiological diagnostics and aid in the efficient analysis of chest radiographs.
   Submitted 16 December, 2024; originally announced December 2024.
   Comments: All authors contributed equally

9. arXiv:2412.11391 [pdf, ps, other]  cs.CV
   Temporal Contrastive Learning for Video Temporal Reasoning in Large Vision-Language Models
   Authors: Rafael Souza, Jia-Hao Lim, Alexander Davis
   Abstract: Temporal reasoning is a critical challenge in video-language understanding, as it requires models to align semantic concepts consistently across time. While existing large vision-language models (LVLMs) and large language models (LLMs) excel at static tasks, they struggle to capture dynamic interactions and temporal dependencies in video sequences. In this work, we propose Temporal Semantic Alignment via Dynamic Prompting (TSADP), a novel framework that enhances temporal reasoning capabilities through dynamic task-specific prompts and temporal contrastive learning. TSADP leverages a Dynamic Prompt Generator (DPG) to encode fine-grained temporal relationships and a Temporal Contrastive Loss (TCL) to align visual and textual embeddings across time. We evaluate our method on the VidSitu dataset, augmented with enriched temporal annotations, and demonstrate significant improvements over state-of-the-art models in tasks such as Intra-Video Entity Association, Temporal Relationship Understanding, and Chronology Prediction. Human evaluations further confirm TSADP's ability to generate coherent and semantically accurate descriptions. Our analysis highlights the robustness, efficiency, and practical utility of TSADP, making it a step forward in the field of video-language understanding.
   Submitted 15 December, 2024; originally announced December 2024.
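The abstract above describes a Temporal Contrastive Loss (TCL) that aligns visual and textual embeddings across time. The sketch below is a generic InfoNCE-style per-timestep alignment loss in PyTorch, offered only as a rough illustration of that idea; the paper's actual loss, negative sampling, and batching may differ, and the tensor shapes here are assumptions.

```python
# Hedged sketch of a temporal contrastive objective in the spirit of the TCL
# described above: per-timestep visual and textual embeddings of the same clip
# are treated as positives, all other timesteps as negatives (InfoNCE style).
import torch
import torch.nn.functional as F

def temporal_contrastive_loss(visual, text, temperature=0.07):
    """visual, text: tensors of shape (T, D), one embedding per timestep."""
    v = F.normalize(visual, dim=-1)
    t = F.normalize(text, dim=-1)
    logits = v @ t.T / temperature            # (T, T) cross-modal similarity matrix
    targets = torch.arange(v.size(0))         # timestep i should match timestep i
    # Symmetric cross-entropy: align video->text and text->video.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.T, targets))

# Toy usage: 8 timesteps, 256-dim embeddings.
loss = temporal_contrastive_loss(torch.randn(8, 256), torch.randn(8, 256))
print(loss.item())
```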
10. arXiv:2412.10436 [pdf, other]  cs.CV cs.LG
   Benchmarking Federated Learning for Semantic Datasets: Federated Scene Graph Generation
   Authors: SeungBum Ha, Taehwan Lee, Jiyoun Lim, Sung Whan Yoon
   Abstract: Federated learning (FL) has recently garnered attention as a data-decentralized training framework that enables the learning of deep models from locally distributed samples while keeping data privacy. Built upon the framework, immense efforts have been made to establish FL benchmarks, which provide rigorous evaluation settings that control data heterogeneity across clients. Prior efforts have mainly focused on handling relatively simple classification tasks, where each sample is annotated with a one-hot label, such as MNIST, CIFAR, LEAF benchmark, etc. However, little attention has been paid to demonstrating an FL benchmark that handles complicated semantics, where each sample encompasses diverse semantic information from multiple labels, such as Panoptic Scene Graph Generation (PSG) with objects, subjects, and relations between them. Because the existing benchmark is designed to distribute data in a narrow view of a single semantic, e.g., a one-hot label, managing the complicated semantic heterogeneity across clients when formalizing FL benchmarks is non-trivial. In this paper, we propose a benchmark process to establish an FL benchmark with controllable semantic heterogeneity across clients: two key steps are i) data clustering with semantics and ii) data distributing via controllable semantic heterogeneity across clients. As a proof of concept, we first construct a federated PSG benchmark, demonstrating the efficacy of the existing PSG methods in an FL setting with controllable semantic heterogeneity of scene graphs. We also present the effectiveness of our benchmark by applying robust federated learning algorithms to data heterogeneity to show increased performance. Our code is available at https://github.com/Seung-B/FL-PSG.
   Submitted 11 December, 2024; originally announced December 2024.
   Comments: This work has been submitted to the IEEE for possible publication

11. arXiv:2412.07251 [pdf]  cs.CL
   KULTURE Bench: A Benchmark for Assessing Language Model in Korean Cultural Context
   Authors: Xiaonan Wang, Jinyoung Yeo, Joon-Ho Lim, Hansaem Kim
   Abstract: Large language models have exhibited significant enhancements in performance across various tasks. However, the complexity of their evaluation increases as these models generate more fluent and coherent content. Current multilingual benchmarks often use translated English versions, which may incorporate Western cultural biases that do not accurately assess other languages and cultures. To address this research gap, we introduce KULTURE Bench, an evaluation framework specifically designed for Korean culture that features datasets of cultural news, idioms, and poetry. It is designed to assess language models' cultural comprehension and reasoning capabilities at the word, sentence, and paragraph levels. Using the KULTURE Bench, we assessed the capabilities of models trained with different language corpora and analyzed the results comprehensively. The results show that there is still significant room for improvement in the models' understanding of texts related to the deeper aspects of Korean culture.
   Submitted 10 December, 2024; originally announced December 2024.
   Comments: Accepted by the 38th Pacific Asia Conference on Language, Information and Computation

12. arXiv:2411.10702 [pdf, other]  cs.IT cs.LG eess.SP eess.SY
   Wireless Resource Allocation with Collaborative Distributed and Centralized DRL under Control Channel Attacks
   Authors: Ke Wang, Wanchun Liu, Teng Joon Lim
   Abstract: In this paper, we consider a wireless resource allocation problem in a cyber-physical system (CPS) where the control channel, carrying resource allocation commands, is subjected to denial-of-service (DoS) attacks. We propose a novel concept of collaborative distributed and centralized (CDC) resource allocation to effectively mitigate the impact of these attacks. To optimize the CDC resource allocation policy, we develop a new CDC-deep reinforcement learning (DRL) algorithm, whereas existing DRL frameworks only formulate either centralized or distributed decision-making problems. Simulation results demonstrate that the CDC-DRL algorithm significantly outperforms state-of-the-art DRL benchmarks, showcasing its ability to address resource allocation problems in large-scale CPSs under control channel attacks.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: This work has been submitted to the IEEE for possible publication

13. arXiv:2411.09255 [pdf, other]  cs.CL
   DAHL: Domain-specific Automated Hallucination Evaluation of Long-Form Text through a Benchmark Dataset in Biomedicine
   Authors: Jean Seo, Jongwon Lim, Dongjun Jang, Hyopil Shin
   Abstract: We introduce DAHL, a benchmark dataset and automated evaluation system designed to assess hallucination in long-form text generation, specifically within the biomedical domain. Our benchmark dataset, meticulously curated from biomedical research papers, consists of 8,573 questions across 29 categories. DAHL evaluates fact-conflicting hallucinations in Large Language Models (LLMs) by deconstructing responses into atomic units, each representing a single piece of information. The accuracy of these responses is averaged to produce the DAHL Score, offering a more in-depth evaluation of hallucinations compared to previous methods that rely on multiple-choice tasks. We conduct experiments with 8 different models, finding that larger models tend to hallucinate less; however, beyond a model size of 7 to 8 billion parameters, further scaling does not significantly improve factual accuracy. The DAHL Score holds potential as an efficient alternative to human-annotated preference labels, being able to be expanded to other specialized domains. We release the dataset and code in public.
   Submitted 14 November, 2024; originally announced November 2024.
   Comments: EMNLP2024/FEVER

14. arXiv:2411.08879 [pdf, other]  cs.CV cs.AI
   4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization
   Authors: Mijeong Kim, Jongwoo Lim, Bohyung Han
   Abstract: Novel view synthesis of dynamic scenes is becoming important in various applications, including augmented and virtual reality. We propose a novel 4D Gaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded monocular videos. To overcome the overfitting problem of existing work for these real-world videos, we introduce an uncertainty-aware regularization that identifies uncertai…
We propose a novel 4D Gaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded monocular videos. To overcome the overfitting problem of existing work for these real-world videos, we introduce an uncertainty-aware regularization that identifies uncertain regions with few observations and selectively imposes additional priors based on diffusion models and depth smoothness on such regions. This approach improves both the performance of novel view synthesis and the quality of training image reconstruction. We also identify the initialization problem of 4DGS in fast-moving dynamic regions, where the Structure from Motion (SfM) algorithm fails to provide reliable 3D landmarks. To initialize Gaussian primitives in such regions, we present a dynamic region densification method using the estimated depth maps and scene flow. Our experiments show that the proposed method improves the performance of 4DGS reconstruction from a video captured by a handheld monocular camera and also exhibits promising results in few-shot static scene reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08879v1-abstract-full').style.display = 'none'; document.getElementById('2411.08879v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06702">arXiv:2411.06702</a> <span> [<a href="https://arxiv.org/pdf/2411.06702">pdf</a>, <a href="https://arxiv.org/format/2411.06702">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Track Any Peppers: Weakly Supervised Sweet Pepper Tracking Using VLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J+S">Jia Syuen Lim</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yadan Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tianqi Wei</a>, <a href="/search/cs?searchtype=author&query=Chapman%2C+S">Scott Chapman</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zi Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06702v1-abstract-short" style="display: inline;"> In the Detection and Multi-Object Tracking of Sweet Peppers Challenge, we present Track Any Peppers (TAP) - a weakly supervised ensemble technique for sweet peppers tracking. TAP leverages the zero-shot detection capabilities of vision-language foundation models like Grounding DINO to automatically generate pseudo-labels for sweet peppers in video sequences with minimal human intervention. 
These p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06702v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06702v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06702v1-abstract-full" style="display: none;"> In the Detection and Multi-Object Tracking of Sweet Peppers Challenge, we present Track Any Peppers (TAP) - a weakly supervised ensemble technique for sweet peppers tracking. TAP leverages the zero-shot detection capabilities of vision-language foundation models like Grounding DINO to automatically generate pseudo-labels for sweet peppers in video sequences with minimal human intervention. These pseudo-labels, refined when necessary, are used to train a YOLOv8 segmentation network. To enhance detection accuracy under challenging conditions, we incorporate pre-processing techniques such as relighting adjustments and apply depth-based filtering during post-inference. For object tracking, we integrate the Matching by Segment Anything (MASA) adapter with the BoT-SORT algorithm. Our approach achieves a HOTA score of 80.4%, MOTA of 66.1%, Recall of 74.0%, and Precision of 90.7%, demonstrating effective tracking of sweet peppers without extensive manual effort. This work highlights the potential of foundation models for efficient and accurate object detection and tracking in agricultural settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06702v1-abstract-full').style.display = 'none'; document.getElementById('2411.06702v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17421">arXiv:2410.17421</a> <span> [<a href="https://arxiv.org/pdf/2410.17421">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> From an attention economy to an ecology of attending. 
A manifesto </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bombaerts%2C+G">Gunter Bombaerts</a>, <a href="/search/cs?searchtype=author&query=Hannes%2C+T">Tom Hannes</a>, <a href="/search/cs?searchtype=author&query=Adam%2C+M">Martin Adam</a>, <a href="/search/cs?searchtype=author&query=Aloisi%2C+A">Alessandra Aloisi</a>, <a href="/search/cs?searchtype=author&query=Anderson%2C+J">Joel Anderson</a>, <a href="/search/cs?searchtype=author&query=Berger%2C+L">Lawrence Berger</a>, <a href="/search/cs?searchtype=author&query=Bettera%2C+S+D">Stefano Davide Bettera</a>, <a href="/search/cs?searchtype=author&query=Campo%2C+E">Enrico Campo</a>, <a href="/search/cs?searchtype=author&query=Candiotto%2C+L">Laura Candiotto</a>, <a href="/search/cs?searchtype=author&query=Panizza%2C+S+C">Silvia Caprioglio Panizza</a>, <a href="/search/cs?searchtype=author&query=Citton%2C+Y">Yves Citton</a>, <a href="/search/cs?searchtype=author&query=D%C3%A2%C2%80%C2%99Angelo%2C+D">Diego D’Angelo</a>, <a href="/search/cs?searchtype=author&query=Dennis%2C+M">Matthew Dennis</a>, <a href="/search/cs?searchtype=author&query=Depraz%2C+N">Nathalie Depraz</a>, <a href="/search/cs?searchtype=author&query=Doran%2C+P">Peter Doran</a>, <a href="/search/cs?searchtype=author&query=Drechsler%2C+W">Wolfgang Drechsler</a>, <a href="/search/cs?searchtype=author&query=Duane%2C+B">Bill Duane</a>, <a href="/search/cs?searchtype=author&query=Edelglass%2C+W">William Edelglass</a>, <a href="/search/cs?searchtype=author&query=Eisenberger%2C+I">Iris Eisenberger</a>, <a href="/search/cs?searchtype=author&query=McGuire%2C+B+F">Beverley Foulks McGuire</a>, <a href="/search/cs?searchtype=author&query=Fredriksson%2C+A">Antony Fredriksson</a>, <a href="/search/cs?searchtype=author&query=Gill%2C+K+S">Karamjit S. Gill</a>, <a href="/search/cs?searchtype=author&query=Hershock%2C+P+D">Peter D. Hershock</a>, <a href="/search/cs?searchtype=author&query=Hongladarom%2C+S">Soraj Hongladarom</a>, <a href="/search/cs?searchtype=author&query=Jacobs%2C+B">Beth Jacobs</a> , et al. (30 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17421v1-abstract-short" style="display: inline;"> As the signatories of this manifesto, we denounce the attention economy as inhumane and a threat to our sociopolitical and ecological well-being. We endorse policymakers' efforts to address the negative consequences of the attention economy's technology, but add that these approaches are often limited in their criticism of the systemic context of human attention. Starting from Buddhist philosophy,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17421v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17421v1-abstract-full" style="display: none;"> As the signatories of this manifesto, we denounce the attention economy as inhumane and a threat to our sociopolitical and ecological well-being. We endorse policymakers' efforts to address the negative consequences of the attention economy's technology, but add that these approaches are often limited in their criticism of the systemic context of human attention.
Starting from Buddhist philosophy, we advocate a broader approach: an ecology of attending, that centers on conceptualizing, designing, and using attention (1) in an embedded way and (2) focused on the alleviating of suffering. With 'embedded' we mean that attention is not a neutral, isolated mechanism but a meaning-engendering part of an 'ecology' of bodily, sociotechnical and moral frameworks. With 'focused on the alleviation of suffering' we explicitly move away from the (often implicit) conception of attention as a tool for gratifying desires. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17421v1-abstract-full').style.display = 'none'; document.getElementById('2410.17421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16038">arXiv:2410.16038</a> <span> [<a href="https://arxiv.org/pdf/2410.16038">pdf</a>, <a href="https://arxiv.org/format/2410.16038">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Pathology Foundation Models: Adaptation Strategies and Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jeaung Lee</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jeewoo Lim</a>, <a href="/search/cs?searchtype=author&query=Byeon%2C+K">Keunho Byeon</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+J+T">Jin Tae Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16038v1-abstract-short" style="display: inline;"> In computational pathology, several foundation models have recently emerged and demonstrated enhanced learning capability for analyzing pathology images. However, adapting these models to various downstream tasks remains challenging, particularly when faced with datasets from different sources and acquisition conditions, as well as limited data availability. In this study, we benchmark four pathol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16038v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16038v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16038v1-abstract-full" style="display: none;"> In computational pathology, several foundation models have recently emerged and demonstrated enhanced learning capability for analyzing pathology images. However, adapting these models to various downstream tasks remains challenging, particularly when faced with datasets from different sources and acquisition conditions, as well as limited data availability. 
In this study, we benchmark four pathology-specific foundation models across 14 datasets and two scenarios (consistency assessment and flexibility assessment), addressing diverse adaptation scenarios and downstream tasks. In the consistency assessment scenario, involving five fine-tuning methods, we found that the parameter-efficient fine-tuning approach was both efficient and effective for adapting pathology-specific foundation models to diverse datasets within the same downstream task. In the flexibility assessment scenario under data-limited environments, utilizing five few-shot learning methods, we observed that the foundation models benefited more from the few-shot learning methods that involve modification during the testing phase only. These findings provide insights that could guide the deployment of pathology-specific foundation models in real clinical settings, potentially improving the accuracy and reliability of pathology image analysis. The code for this study is available at: https://github.com/QuIIL/BenchmarkingPathologyFoundationModels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16038v1-abstract-full').style.display = 'none'; document.getElementById('2410.16038v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11833">arXiv:2410.11833</a> <span> [<a href="https://arxiv.org/pdf/2410.11833">pdf</a>, <a href="https://arxiv.org/format/2410.11833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Suboptimality of Deterministic Policy Gradients in Complex Q-functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+A">Ayush Jain</a>, <a href="/search/cs?searchtype=author&query=Kosaka%2C+N">Norio Kosaka</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinhu Li</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Min Kim</a>, <a href="/search/cs?searchtype=author&query=B%C4%B1y%C4%B1k%2C+E">Erdem Bıyık</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+J">Joseph J. Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11833v1-abstract-short" style="display: inline;"> In reinforcement learning, off-policy actor-critic approaches like DDPG and TD3 are based on the deterministic policy gradient. Herein, the Q-function is trained from off-policy environment data and the actor (policy) is trained to maximize the Q-function via gradient ascent.
We observe that in complex tasks like dexterous manipulation and restricted locomotion, the Q-value is a complex function o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11833v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11833v1-abstract-full" style="display: none;"> In reinforcement learning, off-policy actor-critic approaches like DDPG and TD3 are based on the deterministic policy gradient. Herein, the Q-function is trained from off-policy environment data and the actor (policy) is trained to maximize the Q-function via gradient ascent. We observe that in complex tasks like dexterous manipulation and restricted locomotion, the Q-value is a complex function of action, having several local optima or discontinuities. This poses a challenge for gradient ascent to traverse and makes the actor prone to get stuck at local optima. To address this, we introduce a new actor architecture that combines two simple insights: (i) use multiple actors and evaluate the Q-value maximizing action, and (ii) learn surrogates to the Q-function that are simpler to optimize with gradient-based methods. We evaluate tasks such as restricted locomotion, dexterous manipulation, and large discrete-action space recommender systems and show that our actor finds optimal actions more frequently and outperforms alternate actor architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11833v1-abstract-full').style.display = 'none'; document.getElementById('2410.11833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04817">arXiv:2410.04817</a> <span> [<a href="https://arxiv.org/pdf/2410.04817">pdf</a>, <a href="https://arxiv.org/format/2410.04817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Resource-Efficient Multiview Perception: Integrating Semantic Masking with Masked Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dakic%2C+K">Kosta Dakic</a>, <a href="/search/cs?searchtype=author&query=Thilakarathna%2C+K">Kanchana Thilakarathna</a>, <a href="/search/cs?searchtype=author&query=Calheiros%2C+R+N">Rodrigo N. 
Calheiros</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+T+J">Teng Joon Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04817v1-abstract-short" style="display: inline;"> Multiview systems have become a key technology in modern computer vision, offering advanced capabilities in scene understanding and analysis. However, these systems face critical challenges in bandwidth limitations and computational constraints, particularly for resource-limited camera nodes like drones. This paper presents a novel approach for communication-efficient distributed multiview detecti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04817v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04817v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04817v1-abstract-full" style="display: none;"> Multiview systems have become a key technology in modern computer vision, offering advanced capabilities in scene understanding and analysis. However, these systems face critical challenges in bandwidth limitations and computational constraints, particularly for resource-limited camera nodes like drones. This paper presents a novel approach for communication-efficient distributed multiview detection and tracking using masked autoencoders (MAEs). We introduce a semantic-guided masking strategy that leverages pre-trained segmentation models and a tunable power function to prioritize informative image regions. This approach, combined with an MAE, reduces communication overhead while preserving essential visual information. We evaluate our method on both virtual and real-world multiview datasets, demonstrating comparable performance in terms of detection and tracking performance metrics compared to state-of-the-art techniques, even at high masking ratios. Our selective masking algorithm outperforms random masking, maintaining higher accuracy and precision as the masking ratio increases. Furthermore, our approach achieves a significant reduction in transmission data volume compared to baseline methods, thereby balancing multiview tracking performance with communication efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04817v1-abstract-full').style.display = 'none'; document.getElementById('2410.04817v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
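<p>To make the semantic-guided masking idea described above concrete, the sketch below biases an MAE-style patch mask toward regions that a pre-trained segmentation model marks as informative, using a tunable power function over per-patch foreground scores. The function name, patch size, masking ratio, and weighting scheme are illustrative assumptions, not the authors' implementation.</p>
<pre><code>
import numpy as np

def semantic_guided_mask(seg_map, patch=16, mask_ratio=0.75, power=2.0, rng=None):
    """Pick which patches stay visible to an MAE encoder, biased toward
    semantically informative regions. seg_map is an (H, W) binary foreground
    map from a pre-trained segmentation model; returns a boolean patch grid
    where True marks a masked (dropped) patch."""
    rng = rng or np.random.default_rng()
    h, w = seg_map.shape[0] // patch, seg_map.shape[1] // patch
    # Fraction of foreground pixels inside each patch.
    scores = seg_map[:h * patch, :w * patch].reshape(h, patch, w, patch).mean(axis=(1, 3))
    weights = (scores + 1e-6) ** power            # the tunable power sharpens the preference
    weights = weights.ravel() / weights.sum()
    n_keep = int(round(h * w * (1.0 - mask_ratio)))
    keep = rng.choice(h * w, size=n_keep, replace=False, p=weights)
    mask = np.ones(h * w, dtype=bool)
    mask[keep] = False                            # sampled patches remain visible
    return mask.reshape(h, w)
</code></pre>
<p>Raising the power lowers the chance that background-only patches are transmitted, which is one simple way to trade reconstruction coverage against communication cost.</p>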
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04601">arXiv:2410.04601</a> <span> [<a href="https://arxiv.org/pdf/2410.04601">pdf</a>, <a href="https://arxiv.org/format/2410.04601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ProtocoLLM: Automatic Evaluation Framework of LLMs on Domain-Specific Scientific Protocol Formulation Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+S">Seungjun Yi</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jaeyoung Lim</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+J">Juyong Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04601v1-abstract-short" style="display: inline;"> Automated generation of scientific protocols executable by robots can significantly accelerate scientific research processes. Large Language Models (LLMs) excel at Scientific Protocol Formulation Tasks (SPFT), but the evaluation of their capabilities rely on human evaluation. Here, we propose a flexible, automatic framework to evaluate LLM's capability on SPFT: ProtocoLLM. This framework prompts t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04601v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04601v1-abstract-full" style="display: none;"> Automated generation of scientific protocols executable by robots can significantly accelerate scientific research processes. Large Language Models (LLMs) excel at Scientific Protocol Formulation Tasks (SPFT), but the evaluation of their capabilities rely on human evaluation. Here, we propose a flexible, automatic framework to evaluate LLM's capability on SPFT: ProtocoLLM. This framework prompts the target model and GPT-4 to extract pseudocode from biology protocols using only predefined lab actions and evaluates the output of target model using LLAM-EVAL, the pseudocode generated by GPT-4 serving as a baseline and Llama-3 acting as the evaluator. Our adaptable prompt-based evaluation method, LLAM-EVAL, offers significant flexibility in terms of evaluation model, material, criteria, and is free of cost. We evaluate GPT variations, Llama, Mixtral, Gemma, Cohere, and Gemini. Overall, we find that GPT and Cohere is a powerful scientific protocol formulators. We also introduce BIOPROT 2.0, a dataset with biology protocols and corresponding pseudocodes, which can aid LLMs in formulation and evaluation of SPFT. Our work is extensible to assess LLMs on SPFT across various domains and other fields that require protocol generation for specific goals. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04601v1-abstract-full').style.display = 'none'; document.getElementById('2410.04601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to 2024 ACL Rolling Review June Cycle</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11375">arXiv:2409.11375</a> <span> [<a href="https://arxiv.org/pdf/2409.11375">pdf</a>, <a href="https://arxiv.org/format/2409.11375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-OCT-SelfNet: Integrating Self-Supervised Learning with Multi-Source Data Fusion for Enhanced Multi-Class Retinal Disease Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jannat%2C+F">Fatema-E- Jannat</a>, <a href="/search/cs?searchtype=author&query=Gholami%2C+S">Sina Gholami</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+I">Jennifer I. Lim</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+T">Theodore Leng</a>, <a href="/search/cs?searchtype=author&query=Alam%2C+M+N">Minhaj Nur Alam</a>, <a href="/search/cs?searchtype=author&query=Tabkhi%2C+H">Hamed Tabkhi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11375v1-abstract-short" style="display: inline;"> In the medical domain, acquiring large datasets poses significant challenges due to privacy concerns. Nonetheless, the development of a robust deep-learning model for retinal disease diagnosis necessitates a substantial dataset for training. The capacity to generalize effectively on smaller datasets remains a persistent challenge. The scarcity of data presents a significant barrier to the practica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11375v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11375v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11375v1-abstract-full" style="display: none;"> In the medical domain, acquiring large datasets poses significant challenges due to privacy concerns. Nonetheless, the development of a robust deep-learning model for retinal disease diagnosis necessitates a substantial dataset for training. The capacity to generalize effectively on smaller datasets remains a persistent challenge. The scarcity of data presents a significant barrier to the practical implementation of scalable medical AI solutions. 
To address this issue, we combined a wide range of data sources to improve performance and generalization to new data, and developed a self-supervised framework based on SwinV2 to gain a deeper understanding of multi-modal dataset representations, enhancing the model's ability to extrapolate to new data for the detection of eye diseases using optical coherence tomography (OCT) images. We adopt a two-phase training methodology: self-supervised pre-training followed by fine-tuning on a downstream supervised classifier. An ablation study conducted across three datasets, employing various encoder backbones and covering settings without data fusion, with low data availability, and without self-supervised pre-training, highlights the robustness of our method. Our findings demonstrate consistent performance across these diverse conditions, showcasing superior generalization capabilities compared to the baseline model, ResNet-50. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11375v1-abstract-full').style.display = 'none'; document.getElementById('2409.11375v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 9 tables, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08443">arXiv:2409.08443</a> <span> [<a href="https://arxiv.org/pdf/2409.08443">pdf</a>, <a href="https://arxiv.org/format/2409.08443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CF-PRNet: Coarse-to-Fine Prototype Refining Network for Point Cloud Completion and Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tianqi Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zecheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+S">Jia Syuen Lim</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yadan Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xin Yu</a>, <a href="/search/cs?searchtype=author&query=Chapman%2C+S">Scott Chapman</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zi Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08443v1-abstract-short" style="display: inline;"> In modern agriculture, precise monitoring of plants and fruits is crucial for tasks such as high-throughput phenotyping and automated harvesting. This paper addresses the challenge of reconstructing accurate 3D shapes of fruits from partial views, which is common in agricultural settings.
We introduce CF-PRNet, a coarse-to-fine prototype refining network that leverages high-resolution 3D data during t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08443v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08443v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08443v1-abstract-full" style="display: none;"> In modern agriculture, precise monitoring of plants and fruits is crucial for tasks such as high-throughput phenotyping and automated harvesting. This paper addresses the challenge of reconstructing accurate 3D shapes of fruits from partial views, which is common in agricultural settings. We introduce CF-PRNet, a coarse-to-fine prototype refining network that leverages high-resolution 3D data during the training phase but requires only a single RGB-D image for real-time inference. Our approach begins by extracting the incomplete point cloud data constructed from a partial view of a fruit with a series of convolutional blocks. The extracted features inform the generation of scaling vectors that refine two sequentially constructed 3D mesh prototypes - one coarse and one fine-grained. This progressive refinement facilitates the detailed completion of the final point clouds, achieving detailed and accurate reconstructions. CF-PRNet demonstrates excellent performance metrics with a Chamfer Distance of 3.78, an F1 Score of 66.76%, a Precision of 56.56%, and a Recall of 85.31%, and won first place in the Shape Completion and Reconstruction of Sweet Peppers Challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08443v1-abstract-full').style.display = 'none'; document.getElementById('2409.08443v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
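<p>For context on the metrics quoted above, a symmetric Chamfer Distance between a predicted and a ground-truth point cloud can be computed as in the sketch below; whether the challenge used the squared or unsquared variant, or a particular scaling, is not stated here, so treat this as a generic reference implementation rather than the official scorer.</p>
<pre><code>
import numpy as np
from scipy.spatial import cKDTree

def chamfer_distance(pred, gt):
    """Symmetric Chamfer distance between (N, 3) and (M, 3) point clouds."""
    d_pred_to_gt, _ = cKDTree(gt).query(pred)   # nearest ground-truth point per predicted point
    d_gt_to_pred, _ = cKDTree(pred).query(gt)   # nearest predicted point per ground-truth point
    return np.mean(d_pred_to_gt ** 2) + np.mean(d_gt_to_pred ** 2)
</code></pre>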
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report of the 1st place solution to CVPPA@ECCV2024: Shape Completion and Reconstruction of Sweet Peppers Challenge</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16276">arXiv:2408.16276</a> <span> [<a href="https://arxiv.org/pdf/2408.16276">pdf</a>, <a href="https://arxiv.org/ps/2408.16276">ps</a>, <a href="https://arxiv.org/format/2408.16276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing AI-Driven Psychological Consultation: Layered Prompts with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Souza%2C+R">Rafael Souza</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jia-Hao Lim</a>, <a href="/search/cs?searchtype=author&query=Davis%2C+A">Alexander Davis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16276v1-abstract-short" style="display: inline;"> Psychological consultation is essential for improving mental health and well-being, yet challenges such as the shortage of qualified professionals and scalability issues limit its accessibility. To address these challenges, we explore the use of large language models (LLMs) like GPT-4 to augment psychological consultation services. Our approach introduces a novel layered prompting system that dyna… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16276v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16276v1-abstract-full" style="display: none;"> Psychological consultation is essential for improving mental health and well-being, yet challenges such as the shortage of qualified professionals and scalability issues limit its accessibility. To address these challenges, we explore the use of large language models (LLMs) like GPT-4 to augment psychological consultation services. Our approach introduces a novel layered prompting system that dynamically adapts to user input, enabling comprehensive and relevant information gathering. We also develop empathy-driven and scenario-based prompts to enhance the LLM's emotional intelligence and contextual understanding in therapeutic settings. We validated our approach through experiments using a newly collected dataset of psychological consultation dialogues, demonstrating significant improvements in response quality. The results highlight the potential of our prompt engineering techniques to enhance AI-driven psychological consultation, offering a scalable and accessible solution to meet the growing demand for mental health support. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16276v1-abstract-full').style.display = 'none'; document.getElementById('2408.16276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09111">arXiv:2408.09111</a> <span> [<a href="https://arxiv.org/pdf/2408.09111">pdf</a>, <a href="https://arxiv.org/format/2408.09111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Measuring Agreeableness Bias in Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jaehyuk Lim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B+W">Bruce W. Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09111v2-abstract-short" style="display: inline;"> This paper examines a phenomenon in multimodal language models where pre-marked options in question images can significantly influence model responses. Our study employs a systematic methodology to investigate this effect: we present models with images of multiple-choice questions, which they initially answer correctly, then expose the same model to versions with pre-marked options. Our findings r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09111v2-abstract-full').style.display = 'inline'; document.getElementById('2408.09111v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09111v2-abstract-full" style="display: none;"> This paper examines a phenomenon in multimodal language models where pre-marked options in question images can significantly influence model responses. Our study employs a systematic methodology to investigate this effect: we present models with images of multiple-choice questions, which they initially answer correctly, then expose the same model to versions with pre-marked options. Our findings reveal a significant shift in the models' responses towards the pre-marked option, even when it contradicts their answers in the neutral settings. Comprehensive evaluations demonstrate that this agreeableness bias is a consistent and quantifiable behavior across various model architectures. These results show potential limitations in the reliability of these models when processing images with pre-marked options, raising important questions about their application in critical decision-making contexts where such visual cues might be present. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09111v2-abstract-full').style.display = 'none'; document.getElementById('2408.09111v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08270">arXiv:2408.08270</a> <span> [<a href="https://arxiv.org/pdf/2408.08270">pdf</a>, <a href="https://arxiv.org/format/2408.08270">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HeightLane: BEV Heightmap guided 3D Lane Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+C">Chaesong Park</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+E">Eunbin Seo</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jongwoo Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08270v2-abstract-short" style="display: inline;"> Accurate 3D lane detection from monocular images presents significant challenges due to depth ambiguity and imperfect ground modeling. Previous attempts to model the ground have often used a planar ground assumption with limited degrees of freedom, making them unsuitable for complex road environments with varying slopes. Our study introduces HeightLane, an innovative method that predicts a height… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08270v2-abstract-full').style.display = 'inline'; document.getElementById('2408.08270v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08270v2-abstract-full" style="display: none;"> Accurate 3D lane detection from monocular images presents significant challenges due to depth ambiguity and imperfect ground modeling. Previous attempts to model the ground have often used a planar ground assumption with limited degrees of freedom, making them unsuitable for complex road environments with varying slopes. Our study introduces HeightLane, an innovative method that predicts a height map from monocular images by creating anchors based on a multi-slope assumption. This approach provides a detailed and accurate representation of the ground. HeightLane employs the predicted heightmap along with a deformable attention-based spatial feature transform framework to efficiently convert 2D image features into 3D bird's eye view (BEV) features, enhancing spatial understanding and lane structure recognition. Additionally, the heightmap is used for the positional encoding of BEV features, further improving their spatial accuracy. This explicit view transformation bridges the gap between front-view perceptions and spatially accurate BEV representations, significantly improving detection performance. 
To address the lack of the necessary ground truth (GT) height map in the original OpenLane dataset, we leverage the Waymo dataset and accumulate its LiDAR data to generate a height map for the drivable area of each scene. The GT heightmaps are used to train the heightmap extraction module from monocular images. Extensive experiments on the OpenLane validation set show that HeightLane achieves state-of-the-art performance in terms of F-score, highlighting its potential in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08270v2-abstract-full').style.display = 'none'; document.getElementById('2408.08270v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05555">arXiv:2408.05555</a> <span> [<a href="https://arxiv.org/pdf/2408.05555">pdf</a>, <a href="https://arxiv.org/format/2408.05555">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model-based Role-Playing for Personalized Medical Jargon Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J+H">Jung Hoon Lim</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+S">Sunjae Kwon</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zonghai Yao</a>, <a href="/search/cs?searchtype=author&query=Lalor%2C+J+P">John P. Lalor</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05555v1-abstract-short" style="display: inline;"> Previous studies reveal that Electronic Health Records (EHR), which have been widely adopted in the U.S. to allow patients to access their personal medical information, do not have high readability to patients due to the prevalence of medical jargon. Tailoring medical notes to individual comprehension by identifying jargon that is difficult for each person will enhance the utility of generative mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05555v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05555v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05555v1-abstract-full" style="display: none;"> Previous studies reveal that Electronic Health Records (EHR), which have been widely adopted in the U.S. to allow patients to access their personal medical information, do not have high readability to patients due to the prevalence of medical jargon. 
Tailoring medical notes to individual comprehension by identifying jargon that is difficult for each person will enhance the utility of generative models. We present the first quantitative analysis to measure the impact of role-playing in LLM in medical term extraction. By comparing the results of Mechanical Turk workers over 20 sentences, our study demonstrates that LLM role-playing improves F1 scores in 95% of cases across 14 different socio-demographic backgrounds. Furthermore, applying role-playing with in-context learning outperformed the previous state-of-the-art models. Our research showed that ChatGPT can improve traditional medical term extraction systems by utilizing role-play to deliver personalized patient education, a potential that previous models had not achieved. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05555v1-abstract-full').style.display = 'none'; document.getElementById('2408.05555v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 3 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02261">arXiv:2408.02261</a> <span> [<a href="https://arxiv.org/pdf/2408.02261">pdf</a>, <a href="https://arxiv.org/format/2408.02261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Domain Semantic Segmentation on Inconsistent Taxonomy using VLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jeongkee Lim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yusung Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02261v1-abstract-short" style="display: inline;"> The challenge of semantic segmentation in Unsupervised Domain Adaptation (UDA) emerges not only from domain shifts between source and target images but also from discrepancies in class taxonomies across domains. Traditional UDA research assumes consistent taxonomy between the source and target domains, thereby limiting their ability to recognize and adapt to the taxonomy of the target domain. This… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02261v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02261v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02261v1-abstract-full" style="display: none;"> The challenge of semantic segmentation in Unsupervised Domain Adaptation (UDA) emerges not only from domain shifts between source and target images but also from discrepancies in class taxonomies across domains. 

arXiv:2408.02261 [pdf, other] (cs.CV)
Cross-Domain Semantic Segmentation on Inconsistent Taxonomy using VLMs
Authors: Jeongkee Lim, Yusung Kim
Abstract: The challenge of semantic segmentation in Unsupervised Domain Adaptation (UDA) emerges not only from domain shifts between source and target images but also from discrepancies in class taxonomies across domains. Traditional UDA research assumes a consistent taxonomy between the source and target domains, thereby limiting the ability of such methods to recognize and adapt to the taxonomy of the target domain. This paper introduces a novel approach, Cross-Domain Semantic Segmentation on Inconsistent Taxonomy using Vision Language Models (CSI), which effectively performs domain-adaptive semantic segmentation even in situations of source-target class mismatches. CSI leverages the semantic generalization potential of Visual Language Models (VLMs) to create synergy with previous UDA methods. It leverages segment reasoning obtained through traditional UDA methods, combined with the rich semantic knowledge embedded in VLMs, to relabel new classes in the target domain. This approach allows for effective adaptation to extended taxonomies without requiring any ground truth labels for the target domain. Our method has been shown to be effective across various benchmarks under inconsistent taxonomy settings (coarse-to-fine taxonomy and open taxonomy) and demonstrates consistent synergy effects when integrated with previous state-of-the-art UDA methods. The implementation is available at http://github.com/jkee58/CSI.
Submitted 5 August, 2024; originally announced August 2024.
Comments: ECCV 2024
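A rough sketch of the relabeling idea: score each UDA-predicted segment against an extended target taxonomy with a CLIP-style VLM and assign the best-matching class name. The crop-and-score strategy and the use of the open-source `clip` package are assumptions made for illustration; CSI's actual procedure may differ.

```python
import numpy as np
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def relabel_segment(image: Image.Image, mask: np.ndarray, class_names):
    """Assign a class from an extended taxonomy to one predicted segment (illustrative)."""
    ys, xs = np.nonzero(mask)
    crop = image.crop((int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1))
    with torch.no_grad():
        image_feat = model.encode_image(preprocess(crop).unsqueeze(0).to(device))
        text_feat = model.encode_text(
            clip.tokenize([f"a photo of a {c}" for c in class_names]).to(device)
        )
    image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
    sims = (image_feat @ text_feat.T).squeeze(0)
    return class_names[int(sims.argmax())]
```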

arXiv:2407.16173 [pdf, other] (cs.CV)
Integrating Meshes and 3D Gaussians for Indoor Scene Reconstruction with SAM Mask Guidance
Authors: Jiyeop Kim, Jongwoo Lim
Abstract: We present a novel approach for 3D indoor scene reconstruction that combines 3D Gaussian Splatting (3DGS) with mesh representations. We use meshes for the room layout of the indoor scene, such as walls, ceilings, and floors, while employing 3D Gaussians for other objects. This hybrid approach leverages the strengths of both representations, offering enhanced flexibility and ease of editing. However, joint training of meshes and 3D Gaussians is challenging because it is not clear which primitive should affect which part of the rendered image. Objects close to the room layout often struggle during training, particularly when the room layout is textureless, which can lead to incorrect optimizations and unnecessary 3D Gaussians. To overcome these challenges, we employ the Segment Anything Model (SAM) to guide the selection of primitives. The SAM mask loss enforces each instance to be represented by either Gaussians or meshes, ensuring clear separation and stable training. Furthermore, we introduce an additional densification stage without resetting the opacity after the standard densification. This stage mitigates the degradation of image quality caused by a limited number of 3D Gaussians after the standard densification.
Submitted 23 July, 2024; originally announced July 2024.
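One plausible way to express a mask-exclusivity loss of this kind: for each SAM instance, compare the accumulated rendering contribution of Gaussians versus mesh and penalize mixed explanations. The entropy-style formulation below is an assumption, not the paper's exact loss.

```python
import torch

def sam_exclusivity_loss(gauss_alpha, mesh_alpha, sam_masks, eps=1e-6):
    """Encourage each SAM instance to be explained by Gaussians OR mesh, not both.

    gauss_alpha, mesh_alpha: (H, W) per-pixel accumulated contributions of each primitive
    type to the rendered image (assumed available from the renderer).
    sam_masks: (K, H, W) boolean instance masks from SAM.
    """
    loss = gauss_alpha.new_zeros(())
    for mask in sam_masks:
        g = gauss_alpha[mask].mean()
        m = mesh_alpha[mask].mean()
        share_g = g / (g + m + eps)
        # Binary-entropy penalty: zero when one primitive type fully explains the instance.
        loss = loss - (share_g * torch.log(share_g + eps)
                       + (1 - share_g) * torch.log(1 - share_g + eps))
    return loss / len(sam_masks)
```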

arXiv:2407.04903 [pdf, other] (cs.CL, cs.AI, cs.CV)
MMSci: A Dataset for Graduate-Level Multi-Discipline Multimodal Scientific Understanding
Authors: Zekun Li, Xianjun Yang, Kyuri Choi, Wanrong Zhu, Ryan Hsieh, HyeonJung Kim, Jin Hyuk Lim, Sungyoung Ji, Byungju Lee, Xifeng Yan, Linda Ruth Petzold, Stephen D. Wilson, Woosang Lim, William Yang Wang
Abstract: The rapid development of Multimodal Large Language Models (MLLMs) is making AI-driven scientific assistants increasingly feasible, with interpreting scientific figures being a crucial task. However, existing datasets and benchmarks focus mainly on basic charts and limited science subjects, lacking comprehensive evaluations. To address this, we curated a multimodal, multidisciplinary dataset from peer-reviewed, open-access Nature Communications articles, spanning 72 scientific disciplines.
This dataset includes figures such as schematic diagrams, simulated images, macroscopic/microscopic photos, and experimental visualizations (e.g., western blots), which often require graduate-level, discipline-specific expertise to interpret. We developed benchmarks for scientific figure captioning and multiple-choice questions, evaluating six proprietary and over ten open-source models across varied settings. The results highlight the high difficulty of these tasks and the significant performance gap among models. While many open-source models performed at chance level on the multiple-choice task, some matched the performance of proprietary models. However, the gap was more pronounced in the captioning task. Our dataset also provides a valuable resource for training. Fine-tuning the Qwen2-VL-2B model with our task-specific multimodal training data improved its multiple-choice accuracy to a level comparable to GPT-4o, though captioning remains challenging. Continuous pre-training of MLLMs using our interleaved article and figure data enhanced their material generation capabilities, demonstrating potential for integrating scientific knowledge. The dataset and benchmarks will be released to support further research.
Submitted 8 October, 2024; v1 submitted 5 July, 2024; originally announced July 2024.
Comments: Code and data are available at https://github.com/Leezekun/MMSci

arXiv:2407.00649 [pdf, other] (stat.ML, cs.LG)
Particle Semi-Implicit Variational Inference
Authors: Jen Ning Lim, Adam M. Johansen
Abstract: Semi-implicit variational inference (SIVI) enriches the expressiveness of variational families by utilizing a kernel and a mixing distribution to hierarchically define the variational distribution. Existing SIVI methods parameterize the mixing distribution using implicit distributions, leading to intractable variational densities.
As a result, directly maximizing the evidence lower bound (ELBO) is not possible, so they resort to one of the following: optimizing bounds on the ELBO, employing costly inner-loop Markov chain Monte Carlo runs, or solving minimax objectives. In this paper, we propose a novel method for SIVI called Particle Variational Inference (PVI), which employs empirical measures to approximate the optimal mixing distributions, characterized as the minimizer of a free energy functional. PVI arises naturally as a particle approximation of a Euclidean-Wasserstein gradient flow and, unlike prior works, it directly optimizes the ELBO whilst making no parametric assumption about the mixing distribution. Our empirical results demonstrate that PVI performs favourably compared to other SIVI methods across various tasks. Moreover, we provide a theoretical analysis of the behaviour of the gradient flow of a related free energy functional, establishing the existence and uniqueness of solutions as well as propagation-of-chaos results.
Submitted 14 January, 2025; v1 submitted 30 June, 2024; originally announced July 2024.
Comments: NeurIPS 2024 camera-ready
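For reference, the standard semi-implicit family and the ELBO it induces can be written as below. The particle view of PVI is only indicated schematically, since the paper's free-energy functional may contain additional terms beyond the negative ELBO.

```latex
% Semi-implicit variational family: a kernel k_theta(z|w) mixed over mu(dw)
\[
  q_{\theta,\mu}(z) = \int k_\theta(z \mid w)\, \mu(\mathrm{d}w),
  \qquad
  \mathrm{ELBO}(\theta,\mu) = \mathbb{E}_{q_{\theta,\mu}(z)}\!\left[\log p(x, z) - \log q_{\theta,\mu}(z)\right].
\]
% PVI, schematically: approximate the optimal mixing distribution by an empirical measure
% of N particles and move the particles along a discretized gradient flow that decreases
% a free energy whose minimizer maximizes the ELBO.
\[
  \mu \approx \frac{1}{N}\sum_{i=1}^{N} \delta_{w_i}.
\]
```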

arXiv:2406.17768 [pdf, other] (cs.RO, cs.AI, cs.LG)
EXTRACT: Efficient Policy Learning by Extracting Transferable Robot Skills from Offline Data
Authors: Jesse Zhang, Minho Heo, Zuxin Liu, Erdem Biyik, Joseph J Lim, Yao Liu, Rasool Fakoor
Abstract: Most reinforcement learning (RL) methods focus on learning optimal policies over low-level action spaces. While these methods can perform well in their training environments, they lack the flexibility to transfer to new tasks. Instead, RL agents that can act over useful, temporally extended skills rather than low-level actions can learn new tasks more easily. Prior work in skill-based RL either requires expert supervision to define useful skills, which is hard to scale, or learns a skill-space from offline data with heuristics that limit the adaptability of the skills, making them difficult to transfer during downstream RL. Our approach, EXTRACT, instead utilizes pre-trained vision language models to extract a discrete set of semantically meaningful skills from offline data, each of which is parameterized by continuous arguments, without human supervision. This skill parameterization allows robots to learn new tasks by only needing to learn when to select a specific skill and how to modify its arguments for the specific task. We demonstrate through experiments in sparse-reward, image-based, robot manipulation environments that EXTRACT can more quickly learn new tasks than prior works, with major gains in sample efficiency and performance over prior skill-based RL. Website at https://www.jessezhang.net/projects/extract/.
Submitted 18 September, 2024; v1 submitted 25 June, 2024; originally announced June 2024.
Comments: 25 pages, 16 figures
Journal ref: CoRL 2024

arXiv:2406.15527 [pdf, other] (cs.LG, cs.CL)
Data Efficient Evaluation of Large Language Models and Text-to-Image Models via Adaptive Sampling
Authors: Cong Xu, Gayathri Saranathan, Mahammad Parwez Alam, Arpit Shah, James Lim, Soon Yee Wong, Foltin Martin, Suparna Bhattacharya
Abstract: Evaluating LLMs and text-to-image models is a computationally intensive task that is often overlooked. Efficient evaluation is crucial for understanding the diverse capabilities of these models and enabling comparisons across a growing number of new models and benchmarks.
To address this, we introduce SubLIME, a data-efficient evaluation framework that employs adaptive sampling techniques, such as clustering and quality-based methods, to create representative subsets of benchmarks. Our approach ensures statistically aligned model rankings compared to full datasets, evidenced by high Pearson correlation coefficients. Empirical analysis across six NLP benchmarks reveals that: (1) quality-based sampling methods such as Quality SE and Quality CPD consistently achieve strong correlations (0.85 to 0.95) with full datasets at a 10% sampling rate; (2) clustering methods excel on specific benchmarks such as MMLU; and (3) no single method universally outperforms the others across all metrics. Extending this framework, we leverage the HEIM leaderboard to cover 25 text-to-image models on 17 different benchmarks. SubLIME dynamically selects the optimal technique for each benchmark, significantly reducing evaluation costs while preserving ranking integrity and score distribution. Notably, a minimal sampling rate of 1% proves effective for benchmarks like MMLU. Additionally, we demonstrate that employing difficulty-based sampling to target more challenging benchmark segments enhances model differentiation with broader score distributions. We also combine semantic search, tool use, and GPT-4 review to identify redundancy across benchmarks within specific LLM categories, such as coding benchmarks. This allows us to further reduce the number of samples needed to maintain targeted rank preservation. Overall, SubLIME offers a versatile and cost-effective solution for the robust evaluation of LLMs and text-to-image models.
Submitted 21 June, 2024; originally announced June 2024.
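A small sketch of the fidelity check this relies on: evaluate each model on a sampled subset and verify that its scores and rankings correlate with the full benchmark. The uniform sampler is only a baseline stand-in for SubLIME's clustering- and quality-based samplers, and the dict-based score layout is an illustrative assumption.

```python
import numpy as np

def _ranks(x):
    """Rank values (ties handled naively) for a simple Spearman-style check."""
    order = np.argsort(x)
    ranks = np.empty(len(x), dtype=float)
    ranks[order] = np.arange(len(x))
    return ranks

def subset_fidelity(full_scores, subset_scores):
    """Pearson correlation of scores and rank correlation between full and subset evaluations.

    full_scores / subset_scores: dicts mapping model name -> accuracy on the full
    benchmark and on the sampled subset.
    """
    models = sorted(full_scores)
    full = np.array([full_scores[m] for m in models], dtype=float)
    sub = np.array([subset_scores[m] for m in models], dtype=float)
    pearson = np.corrcoef(full, sub)[0, 1]
    spearman = np.corrcoef(_ranks(full), _ranks(sub))[0, 1]
    return pearson, spearman

def uniform_subset(dataset, rate=0.10, seed=0):
    """Baseline sampler; adaptive (clustering/quality/difficulty) samplers would replace this."""
    rng = np.random.default_rng(seed)
    n = max(1, int(rate * len(dataset)))
    idx = rng.choice(len(dataset), size=n, replace=False)
    return [dataset[i] for i in idx]
```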

arXiv:2406.14924 [pdf, other] (cs.CV)
DiPEx: Dispersing Prompt Expansion for Class-Agnostic Object Detection
Authors: Jia Syuen Lim, Zhuoxiao Chen, Mahsa Baktashmotlagh, Zhi Chen, Xin Yu, Zi Huang, Yadan Luo
Abstract: Class-agnostic object detection (OD) can be a cornerstone or a bottleneck for many downstream vision tasks. Despite considerable advancements in bottom-up and multi-object discovery methods that leverage basic visual cues to identify salient objects, consistently achieving a high recall rate remains difficult due to the diversity of object types and their contextual complexity. In this work, we investigate using vision-language models (VLMs) to enhance object detection via a self-supervised prompt learning strategy. Our initial findings indicate that manually crafted text queries often result in undetected objects, primarily because detection confidence diminishes when the query words exhibit semantic overlap. To address this, we propose a Dispersing Prompt Expansion (DiPEx) approach. DiPEx progressively learns to expand a set of distinct, non-overlapping hyperspherical prompts to enhance recall rates, thereby improving performance in downstream tasks such as out-of-distribution OD. Specifically, DiPEx initiates the process by self-training generic parent prompts and selecting the one with the highest semantic uncertainty for further expansion. The resulting child prompts are expected to inherit semantics from their parent prompts while capturing more fine-grained semantics.
We apply dispersion losses to ensure high inter-class discrepancy among child prompts while preserving semantic consistency between parent-child prompt pairs. To prevent excessive growth of the prompt sets, we utilize the maximum angular coverage (MAC) of the semantic space as a criterion for early termination. We demonstrate the effectiveness of DiPEx through extensive class-agnostic OD and OOD-OD experiments on MS-COCO and LVIS, surpassing other prompting methods by up to 20.1% in AR and achieving a 21.3% AP improvement over SAM. The code is available at https://github.com/jason-lim26/DiPEx.
Submitted 6 February, 2025; v1 submitted 21 June, 2024; originally announced June 2024.
Comments: Accepted to NeurIPS 2024
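A hedged sketch of what a dispersion loss over hyperspherical prompt embeddings could look like, together with one reading of the MAC stopping signal; the exact formulations and weightings in DiPEx may differ.

```python
import torch
import torch.nn.functional as F

def dispersion_loss(child_prompts, parent_prompt, consistency_weight=1.0):
    """Push child prompt embeddings apart on the unit hypersphere while keeping
    each child close to its parent (illustrative formulation).

    child_prompts: (K, D) learnable embeddings; parent_prompt: (D,).
    """
    c = F.normalize(child_prompts, dim=-1)
    p = F.normalize(parent_prompt, dim=-1)
    sim = c @ c.T                                      # pairwise cosine similarity (K, K)
    off_diag = sim - torch.diag_embed(torch.diagonal(sim))
    disperse = off_diag.clamp(min=0).mean()            # penalize overlapping child prompts
    consistency = (1.0 - c @ p).mean()                 # keep children near their parent
    return disperse + consistency_weight * consistency

def max_angular_coverage(child_prompts):
    """Largest pairwise angle spanned by the child prompts; expansion can stop once
    this coverage saturates (one interpretation of the MAC criterion)."""
    c = F.normalize(child_prompts, dim=-1)
    sim = (c @ c.T).clamp(-1.0, 1.0)
    return torch.acos(sim).max()
```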

arXiv:2406.12721 [pdf] (eess.AS, cs.SD)
Sound event detection based on auxiliary decoder and maximum probability aggregation for DCASE Challenge 2024 Task 4
Authors: Sang Won Son, Jongyeon Park, Hong Kook Kim, Sulaiman Vesal, Jeong Eun Lim
Abstract: In this report, we propose three novel methods for developing a sound event detection (SED) model for the DCASE 2024 Challenge Task 4. First, we propose an auxiliary decoder attached to the final convolutional block to improve feature extraction capabilities while reducing dependency on embeddings from pre-trained large models. The proposed auxiliary decoder operates independently from the main decoder, enhancing the performance of the convolutional block during the initial training stages by assigning different weights to the main and auxiliary decoder losses. Next, to address the time-interval issue between the DESED and MAESTRO datasets, we propose maximum probability aggregation (MPA) during the training step. The proposed MPA method enables the model's output to be aligned with the 1-second soft labels of the MAESTRO dataset. Finally, we propose a multi-channel input feature that employs various versions of log-mel and MFCC features to generate time-frequency patterns. The experimental results demonstrate the efficacy of these proposed methods in improving SED performance by achieving a balanced enhancement across different datasets and label types. Ultimately, this approach presents a significant step forward in developing more robust and flexible SED models.
Submitted 24 June, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
Comments: DCASE 2024 challenge Task4, 4 pages
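A minimal sketch of maximum probability aggregation as described: for each class, take the maximum frame-level probability within every one-second window so the output lines up with MAESTRO's 1 s soft labels. The fixed `frames_per_second` argument is an assumption.

```python
import numpy as np

def maximum_probability_aggregation(frame_probs, frames_per_second):
    """Aggregate frame-level event probabilities to 1-second resolution by max-pooling.

    frame_probs: (T, C) array of per-frame class probabilities.
    frames_per_second: number of output frames spanning one second (assumed constant).
    """
    T, C = frame_probs.shape
    n_seconds = int(np.ceil(T / frames_per_second))
    out = np.zeros((n_seconds, C))
    for s in range(n_seconds):
        window = frame_probs[s * frames_per_second:(s + 1) * frames_per_second]
        out[s] = window.max(axis=0)
    return out
```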

arXiv:2406.10809 [pdf, other] (cs.CL, cs.AI)
Post-hoc Utterance Refining Method by Entity Mining for Faithful Knowledge Grounded Conversations
Authors: Yoonna Jang, Suhyune Son, Jeongwoo Lee, Junyoung Son, Yuna Hur, Jungwoo Lim, Hyeonseok Moon, Kisu Yang, Heuiseok Lim
Abstract: Despite the striking advances in recent language generation performance, model-generated responses have suffered from the chronic problem of hallucinations that are either untrue or unfaithful to a given source. Especially in the task of knowledge-grounded conversation, the models are required to generate informative responses, but hallucinated utterances lead to miscommunication. In particular, entity-level hallucination that causes critical misinformation and undesirable conversation is one of the major concerns. To address this issue, we propose a post-hoc refinement method called REM. It aims to enhance the quality and faithfulness of hallucinated utterances by refining them based on the source knowledge. If the generated utterance has a low source-faithfulness score with the given knowledge, REM mines the key entities in the knowledge and implicitly uses them for refining the utterances. We verify that our method reduces entity hallucination in the utterance. Also, we show the adaptability and efficacy of REM with extensive experiments and generative results. Our code is available at https://github.com/YOONNAJANG/REM.
Submitted 16 June, 2024; originally announced June 2024.
Comments: Accepted at EMNLP 2023

arXiv:2406.08528 [pdf, other] (cs.CV, cs.LG)
Adaptive Teaching with Shared Classifier for Knowledge Distillation
Authors: Jaeyeon Jang, Young-Ik Kim, Jisu Lim, Hyeonseong Lee
Abstract: Knowledge distillation (KD) is a technique used to transfer knowledge from an overparameterized teacher network to a less-parameterized student network, thereby minimizing the incurred performance loss. KD methods can be categorized into offline and online approaches. Offline KD leverages a powerful pretrained teacher network, while online KD allows the teacher network to be adjusted dynamically to enhance the learning effectiveness of the student network. Recently, it has been discovered that sharing the classifier of the teacher network can significantly boost the performance of the student network with only a minimal increase in the number of network parameters. Building on these insights, we propose adaptive teaching with a shared classifier (ATSC). In ATSC, the pretrained teacher network self-adjusts to better align with the learning needs of the student network based on its capabilities, and the student network benefits from the shared classifier, enhancing its performance. Additionally, we extend ATSC to environments with multiple teachers. We conduct extensive experiments, demonstrating the effectiveness of the proposed KD method. Our approach achieves state-of-the-art results on the CIFAR-100 and ImageNet datasets in both single-teacher and multi-teacher scenarios, with only a modest increase in the number of required model parameters.
The source code is publicly available at https://github.com/random2314235/ATSC.
Submitted 14 June, 2024; v1 submitted 12 June, 2024; originally announced June 2024.
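A minimal sketch of the shared-classifier setup together with a standard distillation objective. The module names are placeholders for any backbone and pretrained teacher head, and ATSC's adaptive teacher self-adjustment is not shown.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SharedClassifierStudent(nn.Module):
    """Student network that reuses the teacher's classifier head (shared-classifier KD)."""

    def __init__(self, student_backbone: nn.Module, teacher_classifier: nn.Module):
        super().__init__()
        self.backbone = student_backbone
        self.classifier = teacher_classifier  # shared head; freeze or adapt it as desired

    def forward(self, x):
        return self.classifier(self.backbone(x))

def kd_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.5):
    """Cross-entropy on labels plus temperature-softened KL divergence to the teacher."""
    ce = F.cross_entropy(student_logits, targets)
    kl = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    return alpha * ce + (1 - alpha) * kl
```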

arXiv:2406.06367 [pdf, other] (cs.CV)
MVGamba: Unify 3D Content Generation as State Space Sequence Modeling
Authors: Xuanyu Yi, Zike Wu, Qiuhong Shen, Qingshan Xu, Pan Zhou, Joo-Hwee Lim, Shuicheng Yan, Xinchao Wang, Hanwang Zhang
Abstract: Recent 3D large reconstruction models (LRMs) can generate high-quality 3D content in sub-seconds by integrating multi-view diffusion models with scalable multi-view reconstructors. Current works further leverage 3D Gaussian Splatting as the 3D representation for improved visual quality and rendering efficiency. However, we observe that existing Gaussian reconstruction models often suffer from multi-view inconsistency and blurred textures. We attribute this to the compromise of multi-view information propagation in favor of adopting powerful yet computationally intensive architectures (e.g., Transformers). To address this issue, we introduce MVGamba, a general and lightweight Gaussian reconstruction model featuring a multi-view Gaussian reconstructor based on the RNN-like State Space Model (SSM). Our Gaussian reconstructor propagates causal context containing multi-view information for cross-view self-refinement while generating a long sequence of Gaussians for fine-detail modeling with linear complexity. With off-the-shelf multi-view diffusion models integrated, MVGamba unifies 3D generation tasks from a single image, sparse images, or text prompts. Extensive experiments demonstrate that MVGamba outperforms state-of-the-art baselines in all 3D content generation scenarios with only approximately 0.1× the model size.
Submitted 16 December, 2024; v1 submitted 10 June, 2024; originally announced June 2024.
Comments: Accepted by NeurIPS 2024. Code is included in https://github.com/SkyworkAI/MVGamba

arXiv:2406.05313 [pdf, other] (cs.RO)
Traversing Mars: Cooperative Informative Path Planning to Efficiently Navigate Unknown Scenes
Authors: Friedrich M. Rockenbauer, Jaeyoung Lim, Marcus G. Müller, Roland Siegwart, Lukas Schmid
Abstract: The ability to traverse an unknown environment is crucial for autonomous robot operations. However, due to the limited sensing capabilities and system constraints, approaching this problem with a single robot agent can be slow, costly, and unsafe.
For example, in planetary exploration missions, the wear on the wheels of a rover from abrasive terrain should be minimized at all costs, as repairs are infeasible. On the other hand, utilizing a scouting robot such as a micro aerial vehicle (MAV) has the potential to reduce wear and time costs and to increase the safety of a follower robot. This work proposes a novel cooperative IPP framework that allows a scout (e.g., an MAV) to efficiently explore the minimum-cost path for a follower (e.g., a rover) to reach the goal. We derive theoretical guarantees for our algorithm, and prove that the algorithm always terminates, always finds the optimal path if it exists, and terminates early when the found path is shown to be optimal or infeasible. We show in a thorough experimental evaluation that the guarantees hold in practice, and that our algorithm is 22.5% quicker to find the optimal path and 15% quicker to terminate compared to existing methods.
Submitted 12 June, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
Comments: 8 pages, 9 figures, code will be available at https://github.com/ethz-asl/scouting-ipp

arXiv:2406.01915 [pdf, other] (cs.RO, cs.HC)
Enhancing Human-Robot Collaborative Assembly in Manufacturing Systems Using Large Language Models
Authors: Jonghan Lim, Sujani Patel, Alex Evans, John Pimley, Yifei Li, Ilya Kovalenko
Abstract: The development of human-robot collaboration has the potential to improve manufacturing system performance by leveraging the unique strengths of both humans and robots. On the shop floor, human operators contribute with their adaptability and flexibility in dynamic situations, while robots provide precision and the ability to perform repetitive tasks. However, the communication gap between human operators and robots limits the collaboration and coordination of human-robot teams in manufacturing systems. Our research presents a human-robot collaborative assembly framework that utilizes a large language model for enhancing communication in manufacturing environments. The framework facilitates human-robot communication by integrating voice commands through natural language for task management. A case study for an assembly task demonstrates the framework's ability to process natural language inputs and address real-time assembly challenges, emphasizing adaptability to language variation and efficiency in error resolution.
The results suggest that large language models have the potential to improve human-robot interaction for collaborative manufacturing assembly applications.
Submitted 21 June, 2024; v1 submitted 3 June, 2024; originally announced June 2024.

arXiv:2406.01893 [pdf, other] (cs.MA, cs.AI)
Large Language Model-Enabled Multi-Agent Manufacturing Systems
Authors: Jonghan Lim, Birgit Vogel-Heuser, Ilya Kovalenko
Abstract: Traditional manufacturing faces challenges adapting to dynamic environments and quickly responding to manufacturing changes. The use of multi-agent systems has improved adaptability and coordination but requires further advancements in rapid human instruction comprehension, operational adaptability, and coordination through natural language integration. Large language models like GPT-3.5 and GPT-4 enhance multi-agent manufacturing systems by enabling agents to communicate in natural language and interpret human instructions for decision-making. This research introduces a novel framework where large language models enhance the capabilities of agents in manufacturing, making them more adaptable and capable of processing context-specific instructions.
A case study demonstrates the practical application of this framework, showing how agents can effectively communicate, understand tasks, and execute manufacturing processes, including precise G-code allocation among agents. The findings highlight the importance of continuous large language model integration into multi-agent manufacturing systems and the development of sophisticated agent communication protocols for a more flexible manufacturing system.
Submitted 21 June, 2024; v1 submitted 3 June, 2024; originally announced June 2024.

arXiv:2405.20602 [pdf, other] (cs.LG, cs.CL)
Masked Language Modeling Becomes Conditional Density Estimation for Tabular Data Synthesis
Authors: Seunghwan An, Gyeongdong Woo, Jaesung Lim, ChangHyun Kim, Sungchul Hong, Jong-June Jeon
Abstract: In this paper, our goal is to generate synthetic data for heterogeneous (mixed-type) tabular datasets with high machine learning utility (MLu). Since the MLu performance depends on accurately approximating the conditional distributions, we focus on devising a synthetic data generation method based on conditional distribution estimation.
We introduce MaCoDE by redefining the consecutive multi-class classification task of Masked Language Modeling (MLM) as histogram-based non-parametric conditional density estimation. Our approach enables the estimation of conditional densities across arbitrary combinations of target and conditional variables. We bridge the theoretical gap between distributional learning and MLM by demonstrating that minimizing the orderless multi-class classification loss leads to minimizing the total variation distance between conditional distributions. To validate our proposed model, we evaluate its performance in synthetic data generation across 10 real-world datasets, demonstrating its ability to adjust data privacy levels easily without re-training. Additionally, since masked input tokens in MLM are analogous to missing data, we further assess its effectiveness in handling training datasets with missing values, including multiple imputations of the missing entries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20602v2-abstract-full').style.display = 'none'; document.getElementById('2405.20602v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18027">arXiv:2405.18027</a> <span> [<a href="https://arxiv.org/pdf/2405.18027">pdf</a>, <a href="https://arxiv.org/format/2405.18027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TimeChara: Evaluating Point-in-Time Character Hallucination of Role-Playing Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahn%2C+J">Jaewoo Ahn</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taehyun Lee</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Junyoung Lim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jin-Hwa Kim</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+S">Sangdoo Yun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hwaran Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+G">Gunhee Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18027v1-abstract-short" style="display: inline;"> While Large Language Models (LLMs) can serve as agents to simulate human behaviors (i.e., role-playing agents), we emphasize the importance of point-in-time role-playing. This situates characters at specific moments in the narrative progression for three main reasons: (i) enhancing users' narrative immersion, (ii) avoiding spoilers, and (iii) fostering engagement in fandom role-playing. 
To accurat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18027v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18027v1-abstract-full" style="display: none;"> While Large Language Models (LLMs) can serve as agents to simulate human behaviors (i.e., role-playing agents), we emphasize the importance of point-in-time role-playing. This situates characters at specific moments in the narrative progression for three main reasons: (i) enhancing users' narrative immersion, (ii) avoiding spoilers, and (iii) fostering engagement in fandom role-playing. To accurately represent characters at specific time points, agents must avoid character hallucination, where they display knowledge that contradicts their characters' identities and historical timelines. We introduce TimeChara, a new benchmark designed to evaluate point-in-time character hallucination in role-playing LLMs. Comprising 10,895 instances generated through an automated pipeline, this benchmark reveals significant hallucination issues in current state-of-the-art LLMs (e.g., GPT-4o). To counter this challenge, we propose Narrative-Experts, a method that decomposes the reasoning steps and utilizes narrative experts to reduce point-in-time character hallucinations effectively. Still, our findings with TimeChara highlight the ongoing challenges of point-in-time character hallucination, calling for further study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18027v1-abstract-full').style.display = 'none'; document.getElementById('2405.18027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Findings. 
Code and dataset are released at https://ahnjaewoo.github.io/timechara</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16496">arXiv:2405.16496</a> <span> [<a href="https://arxiv.org/pdf/2405.16496">pdf</a>, <a href="https://arxiv.org/format/2405.16496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring a Multimodal Fusion-based Deep Learning Network for Detecting Facial Palsy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oo%2C+N+H+Y">Nicole Heng Yim Oo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M+H">Min Hun Lee</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+H">Jeong Hoon Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16496v1-abstract-short" style="display: inline;"> Algorithmic detection of facial palsy offers the potential to improve current practices, which usually involve labor-intensive and subjective assessment by clinicians. In this paper, we present a multimodal fusion-based deep learning model that utilizes unstructured data (i.e. an image frame with facial line segments) and structured data (i.e. features of facial expressions) to detect facial palsy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16496v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16496v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16496v1-abstract-full" style="display: none;"> Algorithmic detection of facial palsy offers the potential to improve current practices, which usually involve labor-intensive and subjective assessment by clinicians. In this paper, we present a multimodal fusion-based deep learning model that utilizes unstructured data (i.e. an image frame with facial line segments) and structured data (i.e. features of facial expressions) to detect facial palsy. We then contribute to a study to analyze the effect of different data modalities and the benefits of a multimodal fusion-based approach using videos of 21 facial palsy patients. Our experimental results show that among various data modalities (i.e. unstructured data - RGB images and images of facial line segments and structured data - coordinates of facial landmarks and features of facial expressions), the feed-forward neural network using features of facial expression achieved the highest precision of 76.22 while the ResNet-based model using images of facial line segments achieved the highest recall of 83.47. When we leveraged both images of facial line segments and features of facial expressions, our multimodal fusion-based deep learning model slightly improved the precision score to 77.05 at the expense of a decrease in the recall score. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16496v1-abstract-full').style.display = 'none'; document.getElementById('2405.16496v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.12538">arXiv:2405.12538</a> <span> [<a href="https://arxiv.org/pdf/2405.12538">pdf</a>, <a href="https://arxiv.org/format/2405.12538">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Intent Gap: Knowledge-Enhanced Visual Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziwei Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dongyun Lin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Harry Cheng</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+Y">Yongkang Wong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Ying Sun</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+H">Joo Hwee Lim</a>, <a href="/search/cs?searchtype=author&query=Kankanhalli%2C+M">Mohan Kankanhalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.12538v1-abstract-short" style="display: inline;"> For visual content generation, discrepancies between user intentions and the generated content have been a longstanding problem. This discrepancy arises from two main factors. First, user intentions are inherently complex, with subtle details not fully captured by input prompts. The absence of such details makes it challenging for generative models to accurately reflect the intended meaning, leadi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12538v1-abstract-full').style.display = 'inline'; document.getElementById('2405.12538v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.12538v1-abstract-full" style="display: none;"> For visual content generation, discrepancies between user intentions and the generated content have been a longstanding problem. This discrepancy arises from two main factors. First, user intentions are inherently complex, with subtle details not fully captured by input prompts. The absence of such details makes it challenging for generative models to accurately reflect the intended meaning, leading to a mismatch between the desired and generated output. Second, generative models trained on visual-label pairs lack the comprehensive knowledge to accurately represent all aspects of the input data in their generated outputs. To address these challenges, we propose a knowledge-enhanced iterative refinement framework for visual content generation. We begin by analyzing and identifying the key challenges faced by existing generative models. 
Then, we introduce various knowledge sources, including human insights, pre-trained models, logic rules, and world knowledge, which can be leveraged to address these challenges. Furthermore, we propose a novel visual generation framework that incorporates a knowledge-based feedback module to iteratively refine the generation process. This module gradually improves the alignment between the generated content and user intentions. We demonstrate the efficacy of the proposed framework through preliminary results, highlighting the potential of knowledge-enhanced generative models for intention-aligned content generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12538v1-abstract-full').style.display = 'none'; document.getElementById('2405.12538v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02011">arXiv:2405.02011</a> <span> [<a href="https://arxiv.org/pdf/2405.02011">pdf</a>, <a href="https://arxiv.org/format/2405.02011">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Active Mapping in Steep Alpine Environments with Fixed-wing Aerial Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jaeyoung Lim</a>, <a href="/search/cs?searchtype=author&query=Achermann%2C+F">Florian Achermann</a>, <a href="/search/cs?searchtype=author&query=Lawrance%2C+N">Nicholas Lawrance</a>, <a href="/search/cs?searchtype=author&query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02011v1-abstract-short" style="display: inline;"> Monitoring large scale environments is a crucial task for managing remote alpine environments, especially for hazardous events such as avalanches. One key information for avalanche risk forecast is imagery of released avalanches. As these happen in remote and potentially dangerous locations this data is difficult to obtain. Fixed-wing vehicles, due to their long range and travel speeds are a promi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02011v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02011v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02011v1-abstract-full" style="display: none;"> Monitoring large scale environments is a crucial task for managing remote alpine environments, especially for hazardous events such as avalanches. One key information for avalanche risk forecast is imagery of released avalanches. As these happen in remote and potentially dangerous locations this data is difficult to obtain. Fixed-wing vehicles, due to their long range and travel speeds are a promising platform to gather aerial imagery to map avalanche activities. 
However, operating such vehicles in mountainous terrain remains a challenge due to the complex topography, regulations, and uncertain environment. In this work, we present a system that is capable of safely navigating and mapping an avalanche using a fixed-wing aerial system and discuss the challenges arising when executing such a mission. We show in our field experiments that we can effectively navigate in steep terrain environments while maximizing the map quality. We expect our work to enable more autonomous operations of fixed-wing vehicles in alpine environments to maximize the quality of the data gathered. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02011v1-abstract-full').style.display = 'none'; document.getElementById('2405.02011v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, Accepted to the IEEE ICRA Workshop on Field Robotics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16504">arXiv:2404.16504</a> <span> [<a href="https://arxiv.org/pdf/2404.16504">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Hardware Implementation of Double Pendulum Pseudo Random Number Generator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jarrod Lim</a>, <a href="/search/cs?searchtype=author&query=Piccio%2C+T+M+O">Tom Manuel Opalla Piccio</a>, <a href="/search/cs?searchtype=author&query=Michelle%2C+C+M+J">Chua Min Jie Michelle</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+M">Maoyang Xiang</a>, <a href="/search/cs?searchtype=author&query=Teo%2C+T+H">T. Hui Teo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16504v1-abstract-short" style="display: inline;"> The objective of this project is to utilize an FPGA board which is the CMOD A7 35t to obtain a pseudo random number which can be used for encryption. We aim to achieve this by leveraging the inherent randomness present in environmental data captured by sensors. This data will be used as a seed to initialize an algorithm implemented on the CMOD A7 35t FPGA board. The project will focus on interfaci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16504v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16504v1-abstract-full" style="display: none;"> The objective of this project is to utilize an FPGA board which is the CMOD A7 35t to obtain a pseudo random number which can be used for encryption. 
We aim to achieve this by leveraging the inherent randomness present in environmental data captured by sensors. This data will be used as a seed to initialize an algorithm implemented on the CMOD A7 35t FPGA board. The project will focus on interfacing the sensors with the FPGA and developing suitable algorithms to ensure the generated numbers exhibit strong randomness properties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16504v1-abstract-full').style.display = 'none'; document.getElementById('2404.16504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16133">arXiv:2404.16133</a> <span> [<a href="https://arxiv.org/pdf/2404.16133">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quantitative Characterization of Retinal Features in Translated OCTA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Badhon%2C+R+H">Rashadul Hasan Badhon</a>, <a href="/search/cs?searchtype=author&query=Thompson%2C+A+C">Atalie Carina Thompson</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J+I">Jennifer I. Lim</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+T">Theodore Leng</a>, <a href="/search/cs?searchtype=author&query=Alam%2C+M+N">Minhaj Nur Alam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16133v1-abstract-short" style="display: inline;"> Purpose: This study explores the feasibility of using generative machine learning (ML) to translate Optical Coherence Tomography (OCT) images into Optical Coherence Tomography Angiography (OCTA) images, potentially bypassing the need for specialized OCTA hardware. Methods: The method involved implementing a generative adversarial network framework that includes a 2D vascular segmentation model and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16133v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16133v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16133v1-abstract-full" style="display: none;"> Purpose: This study explores the feasibility of using generative machine learning (ML) to translate Optical Coherence Tomography (OCT) images into Optical Coherence Tomography Angiography (OCTA) images, potentially bypassing the need for specialized OCTA hardware. Methods: The method involved implementing a generative adversarial network framework that includes a 2D vascular segmentation model and
The study utilizes a public dataset of 500 patients, divided into subsets based on resolution and disease status, to validate the quality of TR-OCTA images. The validation employs several quality and quantitative metrics to compare the translated images with ground truth OCTAs (GT-OCTA). We then quantitatively characterize vascular features generated in TR-OCTAs with GT-OCTAs to assess the feasibility of using TR-OCTA for objective disease diagnosis. Result: TR-OCTAs showed high image quality in both 3 and 6 mm datasets (high-resolution, moderate structural similarity and contrast quality compared to GT-OCTAs). There were slight discrepancies in vascular metrics, especially in diseased patients. Blood vessel features like tortuosity and vessel perimeter index showed a better trend compared to density features which are affected by local vascular distortions. Conclusion: This study presents a promising solution to the limitations of OCTA adoption in clinical practice by using vascular features from TR-OCTA for disease detection. Translation relevance: This study has the potential to significantly enhance the diagnostic process for retinal diseases by making detailed vascular imaging more widely available and reducing dependency on costly OCTA equipment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16133v1-abstract-full').style.display = 'none'; document.getElementById('2404.16133v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The article has been revised and edited</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08847">arXiv:2404.08847</a> <span> [<a href="https://arxiv.org/pdf/2404.08847">pdf</a>, <a href="https://arxiv.org/format/2404.08847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3620665.3640384">10.1145/3620665.3640384 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LazyDP: Co-Designing Algorithm-Software for Scalable Training of Differentially Private Recommendation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+J">Juntaek Lim</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+Y">Youngeun Kwon</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+R">Ranggi Hwang</a>, <a href="/search/cs?searchtype=author&query=Maeng%2C+K">Kiwan Maeng</a>, <a href="/search/cs?searchtype=author&query=Suh%2C+G+E">G. 
Edward Suh</a>, <a href="/search/cs?searchtype=author&query=Rhu%2C+M">Minsoo Rhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08847v1-abstract-short" style="display: inline;"> Differential privacy (DP) is widely employed in industry as a practical standard for privacy protection. While private training of computer vision or natural language processing applications has been studied extensively, the computational challenges of training recommender systems (RecSys) with DP have not been explored. In this work, we first present our detailed characterization of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08847v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08847v1-abstract-full" style="display: none;"> Differential privacy (DP) is widely employed in industry as a practical standard for privacy protection. While private training of computer vision or natural language processing applications has been studied extensively, the computational challenges of training recommender systems (RecSys) with DP have not been explored. In this work, we first present our detailed characterization of private RecSys training using DP-SGD, root-causing several of its performance bottlenecks. Specifically, we identify that DP-SGD's noise sampling and noisy gradient update stages suffer from severe compute and memory bandwidth limitations, respectively, causing significant performance overhead in training private RecSys. Based on these findings, we propose LazyDP, an algorithm-software co-design that addresses the compute and memory challenges of training RecSys with DP-SGD. Compared to a state-of-the-art DP-SGD training system, we demonstrate that LazyDP provides an average 119x training throughput improvement while also ensuring that it trains mathematically equivalent, differentially private RecSys models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08847v1-abstract-full').style.display = 'none'; document.getElementById('2404.08847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Published at 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-29), 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04562">arXiv:2404.04562</a> <span> [<a href="https://arxiv.org/pdf/2404.04562">pdf</a>, <a href="https://arxiv.org/format/2404.04562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Time-step Curriculum for One Image to 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xuanyu Yi</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zike Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qingshan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pan Zhou</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Joo-Hwee Lim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanwang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04562v3-abstract-short" style="display: inline;"> Score distillation sampling~(SDS) has been widely adopted to overcome the absence of unseen views in reconstructing 3D objects from a \textbf{single} image. It leverages pre-trained 2D diffusion models as teacher to guide the reconstruction of student 3D models. Despite their remarkable success, SDS-based methods often encounter geometric artifacts and texture saturation. We find out the crux is t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04562v3-abstract-full').style.display = 'inline'; document.getElementById('2404.04562v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04562v3-abstract-full" style="display: none;"> Score distillation sampling~(SDS) has been widely adopted to overcome the absence of unseen views in reconstructing 3D objects from a \textbf{single} image. It leverages pre-trained 2D diffusion models as teacher to guide the reconstruction of student 3D models. Despite their remarkable success, SDS-based methods often encounter geometric artifacts and texture saturation. We find out the crux is the overlooked indiscriminate treatment of diffusion time-steps during optimization: it unreasonably treats the student-teacher knowledge distillation to be equal at all time-steps and thus entangles coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the teacher and student models collaborating with the time-step curriculum in a coarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and Level50 benchmark demonstrate that DTC123 can produce multi-view consistent, high-quality, and diverse 3D assets. Codes and more generation demos will be released in https://github.com/yxymessi/DTC123. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04562v3-abstract-full').style.display = 'none'; document.getElementById('2404.04562v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02077">arXiv:2404.02077</a> <span> [<a href="https://arxiv.org/pdf/2404.02077">pdf</a>, <a href="https://arxiv.org/format/2404.02077">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Energy-Optimized Planning in Non-Uniform Wind Fields with Fixed-Wing Aerial Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yufei Duan</a>, <a href="/search/cs?searchtype=author&query=Achermann%2C+F">Florian Achermann</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+J">Jaeyoung Lim</a>, <a href="/search/cs?searchtype=author&query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02077v2-abstract-short" style="display: inline;"> Fixed-wing small uncrewed aerial vehicles (sUAVs) possess the capability to remain airborne for extended durations and traverse vast distances. However, their operation is susceptible to wind conditions, particularly in regions of complex terrain where high wind speeds may push the aircraft beyond its operational limitations, potentially raising safety concerns. Moreover, wind impacts the energy r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02077v2-abstract-full').style.display = 'inline'; document.getElementById('2404.02077v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02077v2-abstract-full" style="display: none;"> Fixed-wing small uncrewed aerial vehicles (sUAVs) possess the capability to remain airborne for extended durations and traverse vast distances. However, their operation is susceptible to wind conditions, particularly in regions of complex terrain where high wind speeds may push the aircraft beyond its operational limitations, potentially raising safety concerns. Moreover, wind impacts the energy required to follow a path, especially in locations where the wind direction and speed are not favorable. Incorporating wind information into mission planning is essential to ensure both safety and energy efficiency. In this paper, we propose a sampling-based planner using the kinematic Dubins aircraft paths with respect to the ground, to plan energy-efficient paths in non-uniform wind fields. 
We study the planner characteristics with synthetic and real-world wind data and compare its performance against baseline cost and path formulations. We demonstrate that the energy-optimized planner effectively utilizes updrafts to minimize energy consumption, albeit at the expense of increased travel time. The ground-relative path formulation facilitates the generation of safe trajectories onboard sUAVs within reasonable computational timeframes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02077v2-abstract-full').style.display = 'none'; document.getElementById('2404.02077v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">© 2024 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lim%2C+J&start=50" class="pagination-next">Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=50" class="pagination-link" aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=100" class="pagination-link" aria-label="Page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=150" class="pagination-link" aria-label="Page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=200" class="pagination-link" aria-label="Page 5">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Lim%2C+J&start=250" class="pagination-link" aria-label="Page 6">6 </a> </li> </ul> </nav> </div> </main>
<footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> <li><a href="https://info.arxiv.org/help/contact.html">Contact</a></li> <li><a href="https://info.arxiv.org/help/subscribe">Subscribe</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li><a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status</a> (get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank">email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank">slack</a>)</li> </ul> </div> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>