
Search | arXiv e-print repository

Showing 1–50 of 170 results for author: Miao, Y

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page (page 1 of 4).
type="text" value="Miao, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Miao%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Miao, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Miao%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Miao%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Miao%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Miao%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Miao%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16027">arXiv:2411.16027</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16027">pdf</a>, <a href="https://arxiv.org/format/2411.16027">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From Dashcam Videos to Driving Simulations: Stress Testing Automated Vehicles against Rare Events </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yan Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Fainekos%2C+G">Georgios Fainekos</a>, <a href="/search/cs?searchtype=author&amp;query=Hoxha%2C+B">Bardh Hoxha</a>, <a href="/search/cs?searchtype=author&amp;query=Okamoto%2C+H">Hideki Okamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Prokhorov%2C+D">Danil Prokhorov</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+S">Sayan Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16027v1-abstract-short" style="display: inline;"> Testing Automated Driving Systems (ADS) in simulation with realistic driving scenarios is important for verifying their performance. However, converting real-world driving videos into simulation scenarios is a significant challenge due to the complexity of interpreting high-dimensional video data and the time-consuming nature of precise manual scenario reconstruction. 
In this work, we propose a no&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16027v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16027v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16027v1-abstract-full" style="display: none;"> Testing Automated Driving Systems (ADS) in simulation with realistic driving scenarios is important for verifying their performance. However, converting real-world driving videos into simulation scenarios is a significant challenge due to the complexity of interpreting high-dimensional video data and the time-consuming nature of precise manual scenario reconstruction. In this work, we propose a novel framework that automates the conversion of real-world car crash videos into detailed simulation scenarios for ADS testing. Our approach leverages prompt-engineered Video Language Models(VLM) to transform dashcam footage into SCENIC scripts, which define the environment and driving behaviors in the CARLA simulator, enabling the generation of realistic simulation scenarios. Importantly, rather than solely aiming for one-to-one scenario reconstruction, our framework focuses on capturing the essential driving behaviors from the original video while offering flexibility in parameters such as weather or road conditions to facilitate search-based testing. Additionally, we introduce a similarity metric that helps iteratively refine the generated scenario through feedback by comparing key features of driving behaviors between the real and simulated videos. Our preliminary results demonstrate substantial time efficiency, finishing the real-to-sim conversion in minutes with full automation and no human intervention, while maintaining high fidelity to the original driving events. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16027v1-abstract-full').style.display = 'none'; document.getElementById('2411.16027v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11798">arXiv:2411.11798</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11798">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> COST CA20120 INTERACT Framework of Artificial Intelligence Based Channel Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+R">Ruisi He</a>, <a href="/search/cs?searchtype=author&amp;query=Cicco%2C+N+D">Nicola D. 
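   The abstract above describes a generate-compare-refine loop: a VLM drafts a SCENIC script, the resulting CARLA simulation is compared against the source video, and similarity feedback drives regeneration. A minimal sketch of that control flow follows; the three callables are hypothetical placeholders injected as arguments, not the authors' actual prompts, simulator bindings, or metric.

```python
def refine_scenario(video, generate_script, simulate, similarity,
                    max_iters=5, threshold=0.9):
    """Sketch of the real-to-sim loop: regenerate a SCENIC script until the
    simulated driving behavior is close enough to the dashcam footage."""
    script, feedback = None, None
    for _ in range(max_iters):
        # VLM turns footage (plus feedback, if any) into a SCENIC script.
        script = generate_script(video, feedback)
        # Render the scripted scenario in the simulator (e.g., CARLA).
        sim_video = simulate(script)
        # Compare key driving-behavior features of real vs. simulated video.
        score, feedback = similarity(video, sim_video)
        if score >= threshold:
            break
    return script
```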
2. arXiv:2411.11798 [pdf] — cs.IT, cs.AI, eess.SP
   COST CA20120 INTERACT Framework of Artificial Intelligence Based Channel Modeling
   Authors: Ruisi He, Nicola D. Cicco, Bo Ai, Mi Yang, Yang Miao, Mate Boban
   Abstract: Accurate channel models are the prerequisite for communication-theoretic investigations as well as system design. Channel modeling generally relies on statistical and deterministic approaches. However, there are still significant limits to the traditional modeling methods in terms of accuracy, generalization ability, and computational complexity. The fundamental reason is that establishing a quantified and accurate mapping between physical environment and channel characteristics becomes increasingly challenging for modern communication systems. Here, in the context of the COST CA20120 Action, we evaluate and discuss the feasibility and implementation of using artificial intelligence (AI) for channel modeling, and explore where the future of this field lies. First, we present a framework of AI-based channel modeling to characterize complex wireless channels. Then, we highlight in detail some major challenges and present possible solutions: i) estimating the uncertainty of AI-based channel predictions, ii) integrating prior knowledge of propagation to improve generalization capabilities, and iii) interpretable AI for channel modeling. We present and discuss illustrative numerical results to showcase the capabilities of AI-based channel modeling.
   Submitted 31 October, 2024; originally announced November 2024.
   Comments: to appear in IEEE Wireless Communications Magazine
3. arXiv:2411.08045 [pdf] — physics.pop-ph, astro-ph.IM, cs.CY, cs.GR, cs.HC
   Audience Reach of Scientific Data Visualizations in Planetarium-Screened Films
   Authors: Kalina Borkiewicz, Eric Jensen, Yiwen Miao, Stuart Levy, J. P. Naiman, Jeff Carpenter, Katherine E. Isaacs
   Abstract: Quantifying the global reach of planetarium dome shows presents significant challenges due to the lack of standardized viewership tracking mechanisms across diverse planetarium venues. We present an analysis of the global impact of dome shows, presenting data regarding four documentary films from a single visualization lab. Specifically, we designed and administered a viewership survey of four long-running shows that contained cinematic scientific visualizations. Reported survey data show that between 1.2 and 2.6 million people have viewed these four films across the 68 responding planetariums (mean: 1.9 million). When we include estimates and extrapolate for the 315 planetariums that licensed these shows, we arrive at an estimate of 16.5–24.1 million people having seen these films (mean: 20.3 million).
   Submitted 30 October, 2024; originally announced November 2024.

4. arXiv:2411.00372 [pdf, ps, other] — cs.LG, cs.AI
   Generalizability of Memorization Neural Networks
   Authors: Lijia Yu, Xiao-Shan Gao, Lijun Zhang, Yibo Miao
   Abstract: The neural network memorization problem is to study the expressive power of neural networks to interpolate a finite dataset. Although memorization is widely believed to have a close relationship with the strong generalizability of deep learning when using over-parameterized models, to the best of our knowledge, there exists no theoretical study on the generalizability of memorization neural networks. In this paper, we give the first theoretical analysis of this topic. Since using i.i.d. training data is a necessary condition for a learning algorithm to be generalizable, memorization and its generalization theory for i.i.d. datasets are developed under mild conditions on the data distribution. First, algorithms are given to construct memorization networks for an i.i.d. dataset, which have the smallest number of parameters, and even a constant number of parameters. Second, we show that, in order for the memorization networks to be generalizable, the width of the network must be at least equal to the dimension of the data, which implies that the existing memorization networks with an optimal number of parameters are not generalizable. Third, a lower bound for the sample complexity of general memorization algorithms and the exact sample complexity for memorization algorithms with a constant number of parameters are given. It is also shown that there exist data distributions such that, to be generalizable for them, the memorization network must have an exponential number of parameters in the data dimension. Finally, an efficient and generalizable memorization algorithm is given when the number of training samples is greater than the efficient memorization sample complexity of the data distribution.
   Submitted 1 November, 2024; originally announced November 2024.
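   As a concrete illustration of the memorization setting studied above, the toy script below drives an over-parameterized MLP to interpolate a small i.i.d. dataset. It demonstrates the problem statement only, not the paper's parameter-optimal constructions or its generalization bounds.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
n, d = 64, 8                          # samples and data dimension
X = torch.randn(n, d)
y = torch.randint(0, 2, (n, 1)).float()

# Hidden width 256 far exceeds d = 8; per the abstract, width >= d is
# necessary for a memorization network to be generalizable at all.
net = nn.Sequential(nn.Linear(d, 256), nn.ReLU(), nn.Linear(256, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)
for _ in range(2000):
    loss = nn.functional.binary_cross_entropy_with_logits(net(X), y)
    opt.zero_grad(); loss.backward(); opt.step()

# An over-parameterized model typically reaches 100% training accuracy,
# i.e., it memorizes the finite dataset.
acc = ((net(X) > 0) == y.bool()).float().mean()
print(f"train accuracy: {acc:.2f}")
```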
5. arXiv:2410.18585 [pdf, other] — cs.AI, cs.LG
   Aligning CodeLLMs with Direct Preference Optimization
   Authors: Yibo Miao, Bofei Gao, Shanghaoran Quan, Junyang Lin, Daoguang Zan, Jiaheng Liu, Jian Yang, Tianyu Liu, Zhijie Deng
   Abstract: The last year has witnessed the rapid progress of large language models (LLMs) across diverse domains. Among them, CodeLLMs have garnered particular attention because they can not only assist in completing various programming tasks but also represent the decision-making and logical reasoning capabilities of LLMs. However, current CodeLLMs mainly focus on pre-training and supervised fine-tuning scenarios, leaving the alignment stage, which is important for post-training LLMs, under-explored. This work first identifies that the commonly used PPO algorithm may be suboptimal for the alignment of CodeLLMs because the involved reward rules are routinely coarse-grained and potentially flawed. We then advocate addressing this with the DPO algorithm. Based only on preference data pairs, DPO enables the model to rank data automatically, giving rise to a fine-grained rewarding pattern that is more robust than human intervention. We also contribute a pipeline for collecting preference pairs for DPO on CodeLLMs. Studies show that our method significantly improves the performance of existing CodeLLMs on benchmarks such as MBPP and HumanEval.
   Submitted 24 October, 2024; originally announced October 2024.
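   The DPO objective the paper builds on is the standard one from Rafailov et al. (2023): given summed token log-probabilities of a preferred and a rejected completion under the policy and a frozen reference model, the loss widens the policy's margin between the two. A sketch:

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_rejected_logp,
             ref_chosen_logp, ref_rejected_logp, beta=0.1):
    """Standard DPO loss over batches of preference pairs (here, e.g.,
    preferred vs. rejected code completions)."""
    chosen_margin = policy_chosen_logp - ref_chosen_logp
    rejected_margin = policy_rejected_logp - ref_rejected_logp
    # Maximize the log-sigmoid of the scaled margin difference, pushing the
    # policy to prefer the chosen completion relative to the reference.
    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()
```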
6. arXiv:2410.10190 [pdf, other] — cs.LG, cs.AI
   Predicting from Strings: Language Model Embeddings for Bayesian Optimization
   Authors: Tung Nguyen, Qiuyi Zhang, Bangding Yang, Chansoo Lee, Jorg Bornschein, Yingjie Miao, Sagi Perel, Yutian Chen, Xingyou Song
   Abstract: Bayesian Optimization is ubiquitous in the field of experimental design and blackbox optimization for improving search efficiency, but has been traditionally restricted to regression models which are only applicable to fixed search spaces and tabular input features. We propose Embed-then-Regress, a paradigm for applying in-context regression over string inputs, through the use of string embedding capabilities of pretrained language models. By expressing all inputs as strings, we are able to perform general-purpose regression for Bayesian Optimization over various domains including synthetic, combinatorial, and hyperparameter optimization, obtaining comparable results to state-of-the-art Gaussian Process-based algorithms. Code can be found at https://github.com/google-research/optformer/tree/main/optformer/embed_then_regress.
   Submitted 15 October, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
7. arXiv:2410.07985 [pdf, other] — cs.CL
   Omni-MATH: A Universal Olympiad Level Mathematic Benchmark For Large Language Models
   Authors: Bofei Gao, Feifan Song, Zhe Yang, Zefan Cai, Yibo Miao, Qingxiu Dong, Lei Li, Chenghao Ma, Liang Chen, Runxin Xu, Zhengyang Tang, Benyou Wang, Daoguang Zan, Shanghaoran Quan, Ge Zhang, Lei Sha, Yichang Zhang, Xuancheng Ren, Tianyu Liu, Baobao Chang
   Abstract: Recent advancements in large language models (LLMs) have led to significant breakthroughs in mathematical reasoning capabilities. However, existing benchmarks like GSM8K or MATH are now being solved with high accuracy (e.g., OpenAI o1 achieves 94.8% on the MATH dataset), indicating their inadequacy for truly challenging these models. To bridge this gap, we propose a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical reasoning at the Olympiad level. Unlike existing Olympiad-related benchmarks, our dataset focuses exclusively on mathematics and comprises a vast collection of 4428 competition-level problems with rigorous human annotation. These problems are meticulously categorized into over 33 sub-domains and span more than 10 distinct difficulty levels, enabling a holistic assessment of model performance in Olympiad-mathematical reasoning. Furthermore, we conducted an in-depth analysis based on this benchmark. Our experimental results show that even the most advanced models, OpenAI o1-mini and OpenAI o1-preview, struggle with highly challenging Olympiad-level problems, with 60.54% and 52.55% accuracy respectively, highlighting significant challenges in Olympiad-level mathematical reasoning.
   Submitted 10 October, 2024; v1 submitted 10 October, 2024; originally announced October 2024.
   Comments: 26 pages, 17 figures

8. arXiv:2410.04366 [pdf, other] — eess.SP, cs.AI, cs.HC
   RespDiff: An End-to-End Multi-scale RNN Diffusion Model for Respiratory Waveform Estimation from PPG Signals
   Authors: Yuyang Miao, Zehua Chen, Chang Li, Danilo Mandic
   Abstract: Respiratory rate (RR) is a critical health indicator often monitored under inconvenient scenarios, limiting its practicality for continuous monitoring. Photoplethysmography (PPG) sensors, increasingly integrated into wearable devices, offer a chance to continuously estimate RR in a portable manner. In this paper, we propose RespDiff, an end-to-end multi-scale RNN diffusion model for respiratory waveform estimation from PPG signals. RespDiff does not require hand-crafted features or the exclusion of low-quality signal segments, making it suitable for real-world scenarios. The model employs multi-scale encoders to extract features at different resolutions, and a bidirectional RNN to process PPG signals and extract the respiratory waveform. Additionally, a spectral loss term is introduced to optimize the model further. Experiments conducted on the BIDMC dataset demonstrate that RespDiff outperforms notable previous works, achieving a mean absolute error (MAE) of 1.18 bpm for RR estimation while others range from 1.66 to 2.15 bpm, showing its potential for robust and accurate respiratory monitoring in real-world applications.
   Submitted 6 October, 2024; originally announced October 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04366v1-abstract-full').style.display = 'none'; document.getElementById('2410.04366v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20291">arXiv:2409.20291</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.20291">pdf</a>, <a href="https://arxiv.org/format/2409.20291">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RL-GSBridge: 3D Gaussian Splatting Based Real2Sim2Real Method for Robotic Manipulation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yuxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+L">Lei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wenhua Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guangming Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yanzi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hesheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20291v1-abstract-short" style="display: inline;"> Sim-to-Real refers to the process of transferring policies learned in simulation to the real world, which is crucial for achieving practical robotics applications. However, recent Sim2real methods either rely on a large amount of augmented data or large learning models, which is inefficient for specific tasks. In recent years, radiance field-based reconstruction methods, especially the emergence o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20291v1-abstract-full').style.display = 'inline'; document.getElementById('2409.20291v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20291v1-abstract-full" style="display: none;"> Sim-to-Real refers to the process of transferring policies learned in simulation to the real world, which is crucial for achieving practical robotics applications. However, recent Sim2real methods either rely on a large amount of augmented data or large learning models, which is inefficient for specific tasks. In recent years, radiance field-based reconstruction methods, especially the emergence of 3D Gaussian Splatting, making it possible to reproduce realistic real-world scenarios. To this end, we propose a novel real-to-sim-to-real reinforcement learning framework, RL-GSBridge, which introduces a mesh-based 3D Gaussian Splatting method to realize zero-shot sim-to-real transfer for vision-based deep reinforcement learning. We improve the mesh-based 3D GS modeling method by using soft binding constraints, enhancing the rendering quality of mesh models. 
We then employ a GS editing approach to synchronize rendering with the physics simulator, reflecting the interactions of the physical robot more accurately. Through a series of sim-to-real robotic arm experiments, including grasping and pick-and-place tasks, we demonstrate that RL-GSBridge maintains a satisfactory success rate in real-world task completion during sim-to-real transfer. Furthermore, a series of rendering metrics and visualization results indicate that our proposed mesh-based 3D Gaussian reduces artifacts in unstructured objects, demonstrating more realistic rendering performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20291v1-abstract-full').style.display = 'none'; document.getElementById('2409.20291v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures, 4 tables, under review by ICRA2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12186">arXiv:2409.12186</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12186">pdf</a>, <a href="https://arxiv.org/format/2409.12186">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Qwen2.5-Coder Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hui%2C+B">Binyuan Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Z">Zeyu Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiaxi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiajun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+K">Kai Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yang Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Men%2C+R">Rui Men</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Quan%2C+S">Shanghaoran Quan</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yunlong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xingzhang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xuancheng Ren</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12186v3-abstract-short" style="display: inline;"> In this report, we introduce the Qwen2.5-Coder series, a significant upgrade from its predecessor, CodeQwen1.5. This series includes six models: Qwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model, Qwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretrained on a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning, scalable synthetic data genera&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12186v3-abstract-full').style.display = 'inline'; document.getElementById('2409.12186v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12186v3-abstract-full" style="display: none;"> In this report, we introduce the Qwen2.5-Coder series, a significant upgrade from its predecessor, CodeQwen1.5. This series includes six models: Qwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model, Qwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretrained on a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning, scalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder demonstrates impressive code generation capabilities while retaining general and math skills. These models have been evaluated on a wide range of code-related tasks, achieving state-of-the-art (SOTA) performance across more than 10 benchmarks, including code generation, completion, reasoning, and repair, consistently outperforming larger models of the same model size. We believe that the release of the Qwen2.5-Coder series will advance research in code intelligence and, with its permissive licensing, support wider adoption by developers in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12186v3-abstract-full').style.display = 'none'; document.getElementById('2409.12186v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
11. arXiv:2409.03878 [pdf, other] — cs.CV, eess.SP, physics.geo-ph
    Ground-roll Separation From Land Seismic Records Based on Convolutional Neural Network
    Authors: Zhuang Jia, Wenkai Lu, Meng Zhang, Yongkang Miao
    Abstract: The ground-roll wave is a common coherent noise in land field seismic data. This Rayleigh-type surface wave usually has low frequency, low apparent velocity, and high amplitude, and therefore obscures the reflection events of seismic shot gathers. Commonly used techniques focus on the differences between ground roll and reflections in a transformed domain such as the $f-k$, wavelet, or curvelet domain. These approaches use a series of fixed atoms or bases to transform the data from the time-space domain into the transformed domain to separate different waveforms, and thus suffer from the complexity of delicately designing the parameters of the transform-domain filter. To deal with these problems, we propose a novel way to separate ground roll from reflections using a convolutional neural network (CNN), which learns to extract the features of ground roll and reflections automatically from training data. In the proposed method, low-pass-filtered seismic data contaminated by the ground-roll wave are used as the input of the CNN, which then outputs both the ground-roll component and the low-frequency part of the reflection component simultaneously. A discriminative loss is applied together with a similarity loss in the training process to enhance the similarity of the outputs to their training labels as well as the difference between the two outputs. Experiments conducted on both synthetic and real data show that the CNN-based method can separate ground roll from reflections effectively, and generalizes to a certain extent.
    Submitted 5 September, 2024; originally announced September 2024.
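    The training objective sketched below pairs a similarity term (each output matches its own label) with a discriminative term (each output is pushed away from the other label). The abstract does not give the exact distances or weighting, so this particular form is an assumption for illustration.

```python
import torch

def separation_loss(gr_pred, gr_label, refl_pred, refl_label, alpha=0.1):
    # Similarity: each CNN output should match its own training label.
    similarity = ((gr_pred - gr_label) ** 2).mean() + \
                 ((refl_pred - refl_label) ** 2).mean()
    # Discriminative: penalize resemblance to the *other* label, pushing
    # the ground-roll and reflection outputs apart.
    discriminative = ((gr_pred - refl_label) ** 2).mean() + \
                     ((refl_pred - gr_label) ** 2).mean()
    return similarity - alpha * discriminative
```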
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03393">arXiv:2409.03393</a> [<a href="https://arxiv.org/pdf/2409.03393">pdf</a>, <a href="https://arxiv.org/format/2409.03393">other</a>]</p>
<div class="tags is-inline-block"><span class="tag is-small">cs.NI</span></div>
<p class="title is-5 mathjax">VQ-DeepVSC: A Dual-Stage Vector Quantization Framework for Video Semantic Communication</p>
<p class="authors"><span class="search-hit">Authors:</span> Yongyi Miao, Zhongdang Li, Yang Wang, Die Hu, Jun Yan, Youfang Wang</p>
<p class="abstract mathjax"><span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: In response to the rapid growth of global video traffic and the limitations of traditional wireless transmission systems, we propose a novel dual-stage vector quantization framework, VQ-DeepVSC, tailored to enhance video transmission over wireless channels. In the first stage, we design the adaptive keyframe extractor and interpolator, deployed respectively at the transmitter and receiver, which intelligently select key frames to minimize inter-frame redundancy and mitigate the cliff-effect under challenging channel conditions. In the second stage, we propose the semantic vector quantization encoder and decoder, placed respectively at the transmitter and receiver, which efficiently compress key frames using advanced indexing and spatial normalization modules to reduce redundancy. Additionally, we propose adjustable index selection and recovery modules, enhancing compression efficiency and enabling flexible compression ratio adjustment. Compared to the joint source-channel coding (JSCC) framework, the proposed framework exhibits superior compatibility with current digital communication systems. Experimental results demonstrate that VQ-DeepVSC achieves substantial improvements in both Multi-Scale Structural Similarity (MS-SSIM) and Learned Perceptual Image Patch Similarity (LPIPS) metrics over the H.265 standard, particularly under low channel signal-to-noise ratio (SNR) or multi-path channels, highlighting the significantly enhanced transmission capabilities of our approach.</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.</p>
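<p class="is-size-7">At the core of both stages is vector quantization: continuous features are replaced by indices into a shared codebook, so only integers need to cross the channel. A toy sketch of that core operation follows; the codebook size, feature shapes, and variable names are illustrative assumptions, not details from the paper.</p>
<pre><code class="language-python">import numpy as np

rng = np.random.default_rng(0)
codebook = rng.normal(size=(64, 16))    # 64 codewords of dimension 16 (assumed sizes)
features = rng.normal(size=(256, 16))   # encoder output for one keyframe

# Squared distance from every feature vector to every codeword.
dists = ((features[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
indices = dists.argmin(axis=1)          # what the transmitter sends
reconstructed = codebook[indices]       # receiver-side dequantization
</code></pre>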
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02795">arXiv:2409.02795</a> [<a href="https://arxiv.org/pdf/2409.02795">pdf</a>, <a href="https://arxiv.org/format/2409.02795">other</a>]</p>
<div class="tags is-inline-block"><span class="tag is-small">cs.CL</span></div>
<p class="title is-5 mathjax">Towards a Unified View of Preference Learning for Large Language Models: A Survey</p>
<p class="authors"><span class="search-hit">Authors:</span> Bofei Gao, Feifan Song, Yibo Miao, Zefan Cai, Zhe Yang, Liang Chen, Helan Hu, Runxin Xu, Qingxiu Dong, Ce Zheng, Shanghaoran Quan, Wen Xiao, Ge Zhang, Daoguang Zan, Keming Lu, Bowen Yu, Dayiheng Liu, Zeyu Cui, Jian Yang, Lei Sha, Houfeng Wang, Zhifang Sui, Peiyi Wang, Tianyu Liu, Baobao Chang</p>
<p class="abstract mathjax"><span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of the crucial factors behind this success is aligning the LLM's output with human preferences. This alignment process often requires only a small amount of data to efficiently enhance the LLM's performance. While effective, research in this area spans multiple domains, and the methods involved are relatively complex to understand. The relationships between different methods have been under-explored, limiting the development of preference alignment. In light of this, we break down the existing popular alignment strategies into different components and provide a unified framework to study the current alignment strategies, thereby establishing connections among them. In this survey, we decompose all the strategies in preference learning into four components: model, data, feedback, and algorithm. This unified view offers an in-depth understanding of existing alignment algorithms and also opens up possibilities to synergize the strengths of different strategies. Furthermore, we present detailed working examples of prevalent existing algorithms to facilitate a comprehensive understanding for the readers. Finally, based on our unified perspective, we explore the challenges and future research directions for aligning large language models with human preferences.</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.</p>
<p class="comments is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 6 figures</span></p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00922">arXiv:2409.00922</a> [<a href="https://arxiv.org/pdf/2409.00922">pdf</a>, <a href="https://arxiv.org/format/2409.00922">other</a>]</p>
<div class="tags is-inline-block"><span class="tag is-small">cs.CR</span></div>
<div class="tags has-addons is-inline-block"><span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1145/3658644.3690231">10.1145/3658644.3690231</a></span></div>
<p class="title is-5 mathjax">ProphetFuzz: Fully Automated Prediction and Fuzzing of High-Risk Option Combinations with Only Documentation via Large Language Model</p>
<p class="authors"><span class="search-hit">Authors:</span> Dawei Wang, Geng Zhou, Li Chen, Dan Li, Yukai Miao</p>
<p class="abstract mathjax"><span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Vulnerabilities related to option combinations pose a significant challenge in software security testing due to their vast search space. Previous research primarily addressed this challenge through mutation or filtering techniques, which inefficiently treated all option combinations as having equal potential for vulnerabilities, thus wasting considerable time on non-vulnerable targets and resulting in low testing efficiency. In this paper, we utilize carefully designed prompt engineering to drive the large language model (LLM) to predict high-risk option combinations (i.e., those more likely to contain vulnerabilities) and perform fuzz testing automatically without human intervention. We developed a tool called ProphetFuzz and evaluated it on a dataset comprising 52 programs collected from three related studies. The entire experiment consumed 10.44 CPU years. ProphetFuzz successfully predicted 1748 high-risk option combinations at an average cost of only $8.69 per program. Results show that after 72 hours of fuzzing, ProphetFuzz discovered 364 unique vulnerabilities associated with 12.30% of the predicted high-risk option combinations, 32.85% more than the state-of-the-art found in the same timeframe. Additionally, using ProphetFuzz, we conducted persistent fuzzing on the latest versions of these programs, uncovering 140 vulnerabilities, with 93 confirmed by developers and 21 awarded CVE numbers.</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.</p>
<p class="comments is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span></p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12164">arXiv:2407.12164</a> [<a href="https://arxiv.org/pdf/2407.12164">pdf</a>, <a href="https://arxiv.org/format/2407.12164">other</a>]</p>
<div class="tags is-inline-block"><span class="tag is-small">cs.CV</span> <span class="tag is-small">cs.AI</span> <span class="tag is-small">cs.LG</span></div>
<p class="title is-5 mathjax">Subject-driven Text-to-Image Generation via Preference-based Reinforcement Learning</p>
<p class="authors"><span class="search-hit">Authors:</span> Yanting Miao, William Loh, Suraj Kothawade, Pascal Poupart, Abdullah Rashwan, Yeqing Li</p>
<p class="abstract mathjax"><span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Text-to-image generative models have recently attracted considerable interest, enabling the synthesis of high-quality images from textual prompts. However, these models often lack the capability to generate specific subjects from given reference images or to synthesize novel renditions under varying conditions. Methods like DreamBooth and Subject-driven Text-to-Image (SuTI) have made significant progress in this area. Yet, both approaches primarily focus on enhancing similarity to reference images and require expensive setups, often overlooking the need for efficient training and avoiding overfitting to the reference images. In this work, we present the $\lambda$-Harmonic reward function, which provides a reliable reward signal and enables early stopping for faster training and effective regularization. Combined with the Bradley-Terry preference model, the $\lambda$-Harmonic reward function also provides preference labels for subject-driven generation tasks. We propose Reward Preference Optimization (RPO), which offers a simpler setup (requiring only $3\%$ of the negative samples used by DreamBooth) and fewer gradient steps for fine-tuning. Unlike most existing methods, our approach does not require training a text encoder or optimizing text embeddings and achieves text-image alignment by fine-tuning only the U-Net component. Empirically, $\lambda$-Harmonic proves to be a reliable approach for model selection in subject-driven generation tasks. Based on preference labels and early stopping validation from the $\lambda$-Harmonic reward function, our algorithm achieves a state-of-the-art CLIP-I score of 0.833 and a CLIP-T score of 0.314 on DreamBench.</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.</p>
<p class="comments is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span></p>
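<p class="is-size-7">The abstract states that the reward function yields preference labels through the Bradley-Terry model. A minimal sketch of that step is below: the Bradley-Terry probability itself is standard, while the reward values are placeholders rather than the paper's $\lambda$-Harmonic computation.</p>
<pre><code class="language-python">import math

def preference_prob(reward_a, reward_b):
    # Bradley-Terry model: P(a preferred over b) = sigmoid(reward_a - reward_b).
    return 1.0 / (1.0 + math.exp(reward_b - reward_a))

r_a, r_b = 0.9, 0.4            # placeholder reward values
p = preference_prob(r_a, r_b)  # soft preference label for the pair (a, b)
print(round(p, 3))             # 0.622
</code></pre>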
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11011">arXiv:2407.11011</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11011">pdf</a>, <a href="https://arxiv.org/format/2407.11011">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toward Availability Attacks in 3D Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yifan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiao-Shan Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11011v1-abstract-short" style="display: inline;"> Despite the great progress of 3D vision, data privacy and security issues in 3D deep learning are not explored systematically. In the domain of 2D images, many availability attacks have been proposed to prevent data from being illicitly learned by unauthorized deep models. However, unlike images represented on a fixed dimensional grid, point clouds are characterized as unordered and unstructured s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11011v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11011v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11011v1-abstract-full" style="display: none;"> Despite the great progress of 3D vision, data privacy and security issues in 3D deep learning are not explored systematically. In the domain of 2D images, many availability attacks have been proposed to prevent data from being illicitly learned by unauthorized deep models. However, unlike images represented on a fixed dimensional grid, point clouds are characterized as unordered and unstructured sets, posing a significant challenge in designing an effective availability attack for 3D deep learning. In this paper, we theoretically show that extending 2D availability attacks directly to 3D point clouds under distance regularization is susceptible to the degeneracy, rendering the generated poisons weaker or even ineffective. This is because in bi-level optimization, introducing regularization term can result in update directions out of control. To address this issue, we propose a novel Feature Collision Error-Minimization (FC-EM) method, which creates additional shortcuts in the feature space, inducing different update directions to prevent the degeneracy of bi-level optimization. Moreover, we provide a theoretical analysis that demonstrates the effectiveness of the FC-EM attack. 
Extensive experiments on typical point cloud datasets, 3D intracranial aneurysm medical dataset, and 3D face dataset verify the superiority and practicality of our approach. Code is available at https://github.com/hala64/fc-em. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11011v1-abstract-full').style.display = 'none'; document.getElementById('2407.11011v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024, 21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07959">arXiv:2407.07959</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07959">pdf</a>, <a href="https://arxiv.org/format/2407.07959">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Source Code Summarization in the Era of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Weisong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yun Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuekang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+C">Chunrong Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+G">Gelei Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhenyu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07959v1-abstract-short" style="display: inline;"> To support software developers in understanding and maintaining programs, various automatic (source) code summarization techniques have been proposed to generate a concise natural language summary (i.e., comment) for a given code snippet. Recently, the emergence of large language models (LLMs) has led to a great boost in the performance of code-related tasks. In this paper, we undertake a systemat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07959v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07959v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07959v1-abstract-full" style="display: none;"> To support software developers in understanding and maintaining programs, various automatic (source) code summarization techniques have been proposed to generate a concise natural language summary (i.e., comment) for a given code snippet. 
Recently, the emergence of large language models (LLMs) has led to a great boost in the performance of code-related tasks. In this paper, we undertake a systematic and comprehensive study on code summarization in the era of LLMs, which covers multiple aspects involved in the workflow of LLM-based code summarization. Specifically, we begin by examining prevalent automated evaluation methods for assessing the quality of summaries generated by LLMs and find that the results of the GPT-4 evaluation method are most closely aligned with human evaluation. Then, we explore the effectiveness of five prompting techniques (zero-shot, few-shot, chain-of-thought, critique, and expert) in adapting LLMs to code summarization tasks. Contrary to expectations, advanced prompting techniques may not outperform simple zero-shot prompting. Next, we investigate the impact of LLMs&#39; model settings (including top\_p and temperature parameters) on the quality of generated summaries. We find the impact of the two parameters on summary quality varies by the base LLM and programming language, but their impacts are similar. Moreover, we canvass LLMs&#39; abilities to summarize code snippets in distinct types of programming languages. The results reveal that LLMs perform suboptimally when summarizing code written in logic programming languages compared to other language types. Finally, we unexpectedly find that CodeLlama-Instruct with 7B parameters can outperform advanced GPT-4 in generating summaries describing code implementation details and asserting code properties. We hope that our findings can provide a comprehensive understanding of code summarization in the era of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07959v1-abstract-full').style.display = 'none'; document.getElementById('2407.07959v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
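<p class="is-size-7">To make the compared prompting techniques concrete, here is a hedged sketch of two of the five (zero-shot and few-shot) as simple prompt builders; the exact prompt wording used in the study is not reproduced here.</p>
<pre><code class="language-python">def zero_shot_prompt(code):
    return "Summarize the following code in one sentence:\n" + code

def few_shot_prompt(code, examples):
    # examples: list of (code, summary) pairs shown before the query snippet.
    shots = "\n\n".join("Code:\n" + c + "\nSummary: " + s for c, s in examples)
    return shots + "\n\nCode:\n" + code + "\nSummary:"
</code></pre>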
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Just accepted to the 47th International Conference on Software Engineering (ICSE 2025)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68-04 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.3; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05965">arXiv:2407.05965</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05965">pdf</a>, <a href="https://arxiv.org/format/2407.05965">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yifan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lijia Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiao-Shan Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05965v3-abstract-short" style="display: inline;"> The recent development of Sora leads to a new era in text-to-video (T2V) generation. Along with this comes the rising concern about its security risks. The generated videos may contain illegal or unethical content, and there is a lack of comprehensive quantitative understanding of their safety, posing a challenge to their reliability and practical deployment. Previous evaluations primarily focus o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05965v3-abstract-full').style.display = 'inline'; document.getElementById('2407.05965v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05965v3-abstract-full" style="display: none;"> The recent development of Sora leads to a new era in text-to-video (T2V) generation. Along with this comes the rising concern about its security risks. The generated videos may contain illegal or unethical content, and there is a lack of comprehensive quantitative understanding of their safety, posing a challenge to their reliability and practical deployment. Previous evaluations primarily focus on the quality of video generation. 
While some evaluations of text-to-image models have considered safety, they cover fewer aspects and do not address the unique temporal risk inherent in video generation. To bridge this research gap, we introduce T2VSafetyBench, a new benchmark designed for conducting safety-critical assessments of text-to-video models. We define 12 critical aspects of video generation safety and construct a malicious prompt dataset including real-world prompts, LLM-generated prompts and jailbreak attack-based prompts. Based on our evaluation results, we draw several important findings, including: 1) no single model excels in all aspects, with different models showing various strengths; 2) the correlation between GPT-4 assessments and manual reviews is generally high; 3) there is a trade-off between the usability and safety of text-to-video generative models. This indicates that as the field of video generation rapidly advances, safety risks are set to surge, highlighting the urgency of prioritizing video safety. We hope that T2VSafetyBench can provide insights for better understanding the safety of video generation in the era of generative AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05965v3-abstract-full').style.display = 'none'; document.getElementById('2407.05965v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04999">arXiv:2407.04999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04999">pdf</a>, <a href="https://arxiv.org/format/2407.04999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.24963/ijcai.2024/237">10.24963/ijcai.2024/237 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Rethinking the Effectiveness of Graph Classification Datasets in Benchmarks for Assessing GNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhengdao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yong Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Shuai%2C+K">Kefan Shuai</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yiming Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+K">Kai Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04999v1-abstract-short" style="display: inline;"> Graph classification benchmarks, vital for assessing and developing graph neural networks (GNNs), have recently been scrutinized, as simple methods like MLPs have demonstrated comparable performance. 
This leads to an important question: Do these benchmarks effectively distinguish the advancements of GNNs over other methodologies? If so, how do we quantitatively measure this effectiveness? In respo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04999v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04999v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04999v1-abstract-full" style="display: none;"> Graph classification benchmarks, vital for assessing and developing graph neural networks (GNNs), have recently been scrutinized, as simple methods like MLPs have demonstrated comparable performance. This leads to an important question: Do these benchmarks effectively distinguish the advancements of GNNs over other methodologies? If so, how do we quantitatively measure this effectiveness? In response, we first propose an empirical protocol based on a fair benchmarking framework to investigate the performance discrepancy between simple methods and GNNs. We further propose a novel metric to quantify the dataset effectiveness by considering both dataset complexity and model performance. To the best of our knowledge, our work is the first to thoroughly study and provide an explicit definition for dataset effectiveness in the graph learning area. Through testing across 16 real-world datasets, we found our metric to align with existing studies and intuitive assumptions. Finally, we explore the causes behind the low effectiveness of certain datasets by investigating the correlation between intrinsic graph properties and class labels, and we developed a novel technique supporting the correlation-controllable synthetic dataset generation. Our findings shed light on the current understanding of benchmark datasets, and our new platform could fuel the future evolution of graph classification benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04999v1-abstract-full').style.display = 'none'; document.getElementById('2407.04999v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
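<p class="is-size-7">The protocol compares simple baselines against GNNs under one benchmarking framework. The sketch below illustrates the flavor of such a computation with a stand-in formula: the paper defines its own effectiveness metric from dataset complexity and model performance, and the combination used here is an assumption for illustration only.</p>
<pre><code class="language-python"># Stand-in illustration, not the authors' metric.
def performance_gap(gnn_accuracy, mlp_accuracy):
    # How much the GNN actually buys over a structure-blind baseline.
    return gnn_accuracy - mlp_accuracy

def effectiveness_score(gap, complexity):
    # Assumed combination: larger gaps on harder datasets indicate a
    # benchmark that better distinguishes GNN advancements.
    return gap * complexity

print(effectiveness_score(performance_gap(0.82, 0.79), complexity=0.6))
</code></pre>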
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00072">arXiv:2407.00072</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00072">pdf</a>, <a href="https://arxiv.org/format/2407.00072">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yu Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yukai Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dawei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yanyu Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+H">Hongtao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Ce Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xuhui Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00072v5-abstract-short" style="display: inline;"> RAG systems face limitations when semantic relevance alone does not guarantee improved generation quality. This issue becomes particularly evident due to the sensitivity of large language models (LLMs) to the ordering of few-shot prompts, which can affect model performance. To address this challenge, aligning LLM outputs with human preferences using structured feedback, such as options to copy, re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00072v5-abstract-full').style.display = 'inline'; document.getElementById('2407.00072v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00072v5-abstract-full" style="display: none;"> RAG systems face limitations when semantic relevance alone does not guarantee improved generation quality. This issue becomes particularly evident due to the sensitivity of large language models (LLMs) to the ordering of few-shot prompts, which can affect model performance. To address this challenge, aligning LLM outputs with human preferences using structured feedback, such as options to copy, regenerate, or dislike, offers a promising method for improvement. This feedback is applied to the entire list of inputs rather than giving specific ratings for individual documents, making it a Listwide Labels Learning-to-Rank task. To address this task, we propose Pistis-RAG, a new RAG framework designed with a content-centric approach to better align LLMs with human preferences. Pistis-RAG effectively utilizes human feedback, enhancing content ranking and generation quality. To validate our framework, we use public datasets to simulate human feedback, allowing us to evaluate and refine our method effectively. 
Experimental results indicate that Pistis-RAG improves alignment with human preferences relative to the baseline RAG system, showing a 6.06% increase in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy metrics. These results highlight Pistis-RAG&#39;s effectiveness in overcoming the limitations associated with traditional RAG approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00072v5-abstract-full').style.display = 'none'; document.getElementById('2407.00072v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13233">arXiv:2406.13233</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13233">pdf</a>, <a href="https://arxiv.org/format/2406.13233">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AdaMoE: Token-Adaptive Routing with Null Experts for Mixture-of-Experts Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zihao Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Hongcheng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhijie Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13233v2-abstract-short" style="display: inline;"> Mixture of experts (MoE) has become the standard for constructing production-level large language models (LLMs) due to its promise to boost model capacity without causing significant overheads. Nevertheless, existing MoE methods usually enforce a constant top-k routing for all tokens, which is arguably restrictive because various tokens (e.g., &#34;&lt;EOS&gt;&#34; vs. &#34;apple&#34;) may require various numbers of ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13233v2-abstract-full').style.display = 'inline'; document.getElementById('2406.13233v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13233v2-abstract-full" style="display: none;"> Mixture of experts (MoE) has become the standard for constructing production-level large language models (LLMs) due to its promise to boost model capacity without causing significant overheads. Nevertheless, existing MoE methods usually enforce a constant top-k routing for all tokens, which is arguably restrictive because various tokens (e.g., &#34;&lt;EOS&gt;&#34; vs. &#34;apple&#34;) may require various numbers of experts for feature abstraction. Lifting such a constraint can help make the most of limited resources and unleash the potential of the model for downstream tasks. 
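<p class="is-size-7">Since the feedback labels an entire ranked list rather than single documents, one way to picture the resulting training signal is sketched below. The feedback-to-score mapping and data layout are illustrative assumptions, not the paper's implementation.</p>
<pre><code class="language-python"># Hypothetical mapping from structured feedback to a list-level label.
FEEDBACK_SCORE = {"copy": 1.0, "regenerate": 0.0, "dislike": -1.0}

def listwide_label(feedback, ranked_doc_ids):
    # The label attaches to the whole ranked list, not to individual docs,
    # which is what makes this a listwide learning-to-rank task.
    return {"docs": list(ranked_doc_ids), "label": FEEDBACK_SCORE[feedback]}

example = listwide_label("copy", ["doc3", "doc7", "doc1"])
</code></pre>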
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13233">arXiv:2406.13233</a> [<a href="https://arxiv.org/pdf/2406.13233">pdf</a>, <a href="https://arxiv.org/format/2406.13233">other</a>]</p>
<div class="tags is-inline-block"><span class="tag is-small">cs.AI</span></div>
<p class="title is-5 mathjax">AdaMoE: Token-Adaptive Routing with Null Experts for Mixture-of-Experts Language Models</p>
<p class="authors"><span class="search-hit">Authors:</span> Zihao Zeng, Yibo Miao, Hongcheng Gao, Hao Zhang, Zhijie Deng</p>
<p class="abstract mathjax"><span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Mixture of experts (MoE) has become the standard for constructing production-level large language models (LLMs) due to its promise to boost model capacity without causing significant overheads. Nevertheless, existing MoE methods usually enforce a constant top-k routing for all tokens, which is arguably restrictive because various tokens (e.g., "&lt;EOS&gt;" vs. "apple") may require various numbers of experts for feature abstraction. Lifting such a constraint can help make the most of limited resources and unleash the potential of the model for downstream tasks. In this sense, we introduce AdaMoE to realize token-adaptive routing for MoE, where different tokens are permitted to select a varying number of experts. AdaMoE makes minimal modifications to the vanilla MoE with top-k routing -- it simply introduces a fixed number of null experts, which do not consume any FLOPs, to the expert set and increases the value of k. AdaMoE does not force each token to occupy a fixed number of null experts but ensures the average usage of the null experts with a load-balancing loss, leading to an adaptive number of null/true experts used by each token. AdaMoE exhibits a strong resemblance to MoEs with expert choice routing while allowing for trivial auto-regressive modeling. AdaMoE is easy to implement and can be effectively applied to pre-trained (MoE-)LLMs. Extensive studies show that AdaMoE can reduce average expert load (FLOPs) while achieving superior performance. For example, on the ARC-C dataset, applying our method to fine-tuning Mixtral-8x7B can reduce FLOPs by 14.5% while increasing accuracy by 1.69%.</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.</p>
<p class="comments is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of EMNLP 2024</span></p>
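<p class="is-size-7">The routing idea is concrete enough to sketch: append null experts to the expert set, take top-k over the enlarged set, and treat null selections as no-ops so each token uses a variable number of true experts. The sizes and array layout below are assumptions for illustration.</p>
<pre><code class="language-python">import numpy as np

rng = np.random.default_rng(0)
num_true, num_null, k = 8, 4, 4
logits = rng.normal(size=(16, num_true + num_null))  # router logits per token

topk_idx = np.argsort(-logits, axis=1)[:, :k]        # top-k expert ids per token
true_mask = np.less(topk_idx, num_true)              # ids past num_true are null no-ops
experts_per_token = true_mask.sum(axis=1)            # varies from token to token
print(experts_per_token)                             # adaptive true-expert counts
</code></pre>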
href="/search/cs?searchtype=author&amp;query=Yokoya%2C+N">Naoto Yokoya</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minqiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lefei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liangpei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11519v1-abstract-short" style="display: inline;"> Foundation models (FMs) are revolutionizing the analysis and understanding of remote sensing (RS) scenes, including aerial RGB, multispectral, and SAR images. However, hyperspectral images (HSIs), which are rich in spectral information, have not seen much application of FMs, with existing methods often restricted to specific tasks and lacking generality. To fill this gap, we introduce HyperSIGMA,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11519v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11519v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11519v1-abstract-full" style="display: none;"> Foundation models (FMs) are revolutionizing the analysis and understanding of remote sensing (RS) scenes, including aerial RGB, multispectral, and SAR images. However, hyperspectral images (HSIs), which are rich in spectral information, have not seen much application of FMs, with existing methods often restricted to specific tasks and lacking generality. To fill this gap, we introduce HyperSIGMA, a vision transformer-based foundation model for HSI interpretation, scalable to over a billion parameters. To tackle the spectral and spatial redundancy challenges in HSIs, we introduce a novel sparse sampling attention (SSA) mechanism, which effectively promotes the learning of diverse contextual features and serves as the basic block of HyperSIGMA. HyperSIGMA integrates spatial and spectral features using a specially designed spectral enhancement module. In addition, we construct a large-scale hyperspectral dataset, HyperGlobal-450K, for pre-training, which contains about 450K hyperspectral images, significantly surpassing existing datasets in scale. Extensive experiments on various high-level and low-level HSI tasks demonstrate HyperSIGMA&#39;s versatility and superior representational capability compared to current state-of-the-art methods. Moreover, HyperSIGMA shows significant advantages in scalability, robustness, cross-modal transferring capability, and real-world applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11519v1-abstract-full').style.display = 'none'; document.getElementById('2406.11519v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code and models will be released at https://github.com/WHU-Sigma/HyperSIGMA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07327">arXiv:2406.07327</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07327">pdf</a>, <a href="https://arxiv.org/format/2406.07327">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> 3D-Properties: Identifying Challenges in DPO and Charting a Path Forward </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yuzi Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jialian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yipin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Jian Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhijie Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+D">Dong Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07327v1-abstract-short" style="display: inline;"> Aligning large language models (LLMs) with human preference has recently gained tremendous attention, with the canonical yet costly RLHF-PPO and the simple and straightforward Direct Preference Optimization (DPO) as two examples. Despite the efficiency, DPO has rarely be used in the state-of-the-art production-level LLMs, implying its potential pathologies. In this work, we revisit DPO with a comp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07327v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07327v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07327v1-abstract-full" style="display: none;"> Aligning large language models (LLMs) with human preference has recently gained tremendous attention, with the canonical yet costly RLHF-PPO and the simple and straightforward Direct Preference Optimization (DPO) as two examples. Despite the efficiency, DPO has rarely be used in the state-of-the-art production-level LLMs, implying its potential pathologies. In this work, we revisit DPO with a comprehensive examination of its empirical efficacy and a systematic comparison with RLHF-PPO. We identify the \textbf{3D}-properties of DPO&#39;s learning outcomes: the \textbf{D}rastic drop in the likelihood of rejected responses, the \textbf{D}egradation into LLM unlearning, and the \textbf{D}ispersion effect on unseen responses through experiments with both a carefully designed toy model and practical LLMs on tasks including mathematical problem-solving and instruction following. 
These findings inherently connect to some observations made by related works and we additionally contribute a plausible theoretical explanation for them. Accordingly, we propose easy regularization methods to mitigate the issues caused by \textbf{3D}-properties, improving the training stability and final performance of DPO. Our contributions also include an investigation into how the distribution of the paired preference data impacts the effectiveness of DPO. We hope this work could offer research directions to narrow the gap between reward-free preference learning methods and reward-based ones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07327v1-abstract-full').style.display = 'none'; document.getElementById('2406.07327v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00588">arXiv:2406.00588</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00588">pdf</a>, <a href="https://arxiv.org/format/2406.00588">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> Generalization Bound and New Algorithm for Clean-Label Backdoor Attack </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lijia Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiao-Shan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00588v1-abstract-short" style="display: inline;"> The generalization bound is a crucial theoretical tool for assessing the generalizability of learning methods and there exist vast literatures on generalizability of normal learning, adversarial learning, and data poisoning. Unlike other data poison attacks, the backdoor attack has the special property that the poisoned triggers are contained in both the training set and the test set and the purpo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00588v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00588v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00588v1-abstract-full" style="display: none;"> The generalization bound is a crucial theoretical tool for assessing the generalizability of learning methods and there exist vast literatures on generalizability of normal learning, adversarial learning, and data poisoning. 
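<p class="is-size-7">For reference, the canonical DPO objective that the paper analyzes can be written in a few lines; the "drastic drop" finding concerns how this loss drives down the likelihood of rejected responses.</p>
<pre><code class="language-python">import math

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    # Margin between the policy/reference log-ratios of chosen and rejected responses.
    margin = beta * ((logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected))
    # -log(sigmoid(margin)): minimized by raising chosen and lowering rejected likelihoods.
    return -math.log(1.0 / (1.0 + math.exp(-margin)))
</code></pre>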
Unlike other data poisoning attacks, the backdoor attack has the special property that the poisoned triggers are contained in both the training set and the test set, and the purpose of the attack is two-fold. To our knowledge, the generalization bound for the backdoor attack has not been established. In this paper, we fill this gap by deriving algorithm-independent generalization bounds in the clean-label backdoor attack scenario. Precisely, based on the goals of the backdoor attack, we give upper bounds for the clean-sample population errors and the poison population errors in terms of the empirical error on the poisoned training dataset. Furthermore, based on the theoretical result, a new clean-label backdoor attack is proposed that computes the poisoning trigger by combining adversarial noise and indiscriminate poison. We show its effectiveness in a variety of settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00588v1-abstract-full').style.display = 'none'; document.getElementById('2406.00588v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19098">arXiv:2405.19098</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.19098">pdf</a>, <a href="https://arxiv.org/format/2405.19098">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Efficient Black-box Adversarial Attacks via Bayesian Optimization Guided by a Function Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+S">Shuyu Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiao-Shan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19098v1-abstract-short" style="display: inline;"> This paper studies the challenging black-box adversarial attack that aims to generate adversarial examples against a black-box model by only using output feedback of the model to input queries. Some previous methods improve the query efficiency by incorporating the gradient of a surrogate white-box model into query-based attacks due to the adversarial transferability.
However, the localized gradie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19098v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19098v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19098v1-abstract-full" style="display: none;"> This paper studies the challenging black-box adversarial attack that aims to generate adversarial examples against a black-box model by only using output feedback of the model to input queries. Some previous methods improve the query efficiency by incorporating the gradient of a surrogate white-box model into query-based attacks due to the adversarial transferability. However, the localized gradient is not informative enough, making these methods still query-intensive. In this paper, we propose a Prior-guided Bayesian Optimization (P-BO) algorithm that leverages the surrogate model as a global function prior in black-box adversarial attacks. As the surrogate model contains rich prior information of the black-box one, P-BO models the attack objective with a Gaussian process whose mean function is initialized as the surrogate model&#39;s loss. Our theoretical analysis on the regret bound indicates that the performance of P-BO may be affected by a bad prior. Therefore, we further propose an adaptive integration strategy to automatically adjust a coefficient on the function prior by minimizing the regret bound. Extensive experiments on image classifiers and large vision-language models demonstrate the superiority of the proposed algorithm in reducing queries and improving attack success rates compared with the state-of-the-art black-box attacks. Code is available at https://github.com/yibo-miao/PBO-Attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19098v1-abstract-full').style.display = 'none'; document.getElementById('2405.19098v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
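<p class="is-size-7">As a rough illustration of the idea above (not the authors' released code; see their repository for the real implementation), a Gaussian process whose mean function is the surrogate's loss can be sketched in a few lines. Here surrogate_loss, gp_posterior, and ucb_next_query are hypothetical names, and the 1-D objective and RBF kernel are toy assumptions:</p> <pre><code class="language-python">
import numpy as np

def surrogate_loss(x):
    # Stand-in for the white-box surrogate's loss; in P-BO this prior
    # mean would come from a real surrogate network, not a toy formula.
    return np.sin(3.0 * x) + 0.5 * x

def rbf(a, b, length=0.5):
    # RBF kernel between two 1-D arrays of points.
    d = a[:, None] - b[None, :]
    return np.exp(-0.5 * (d / length) ** 2)

def gp_posterior(x_train, y_train, x_test, noise=1e-6):
    # Model only the residual y - m(x), so the surrogate acts as the
    # GP's global mean function rather than a zero prior.
    resid = y_train - surrogate_loss(x_train)
    K = rbf(x_train, x_train) + noise * np.eye(len(x_train))
    Ks = rbf(x_test, x_train)
    mean = surrogate_loss(x_test) + Ks @ np.linalg.solve(K, resid)
    var = 1.0 - np.sum(Ks * np.linalg.solve(K, Ks.T).T, axis=1)
    return mean, np.maximum(var, 0.0)

def ucb_next_query(x_train, y_train, candidates, beta=2.0):
    # Pick the next attack query by an upper-confidence-bound acquisition.
    mean, var = gp_posterior(x_train, y_train, candidates)
    return candidates[np.argmax(mean + beta * np.sqrt(var))]
</code></pre>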
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18524">arXiv:2405.18524</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18524">pdf</a>, <a href="https://arxiv.org/format/2405.18524">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Aligning in a Compact Space: Contrastive Knowledge Distillation between Heterogeneous Architectures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Hongjun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+L">Li Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xingkuo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yining Miao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18524v1-abstract-short" style="display: inline;"> Knowledge distillation is commonly employed to compress neural networks, reducing the inference costs and memory footprint. In the scenario of homogenous architecture, feature-based methods have been widely validated for their effectiveness. However, in scenarios where the teacher and student models are of heterogeneous architectures, the inherent differences in feature representation significantl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18524v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18524v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18524v1-abstract-full" style="display: none;"> Knowledge distillation is commonly employed to compress neural networks, reducing the inference costs and memory footprint. In the scenario of homogenous architecture, feature-based methods have been widely validated for their effectiveness. However, in scenarios where the teacher and student models are of heterogeneous architectures, the inherent differences in feature representation significantly degrade the performance of these methods. Recent studies have highlighted that low-frequency components constitute the majority of image features. Motivated by this, we propose a Low-Frequency Components-based Contrastive Knowledge Distillation (LFCC) framework that significantly enhances the performance of feature-based distillation between heterogeneous architectures. Specifically, we designe a set of multi-scale low-pass filters to extract the low-frequency components of intermediate features from both the teacher and student models, aligning them in a compact space to overcome architectural disparities. Moreover, leveraging the intrinsic pairing characteristic of the teacher-student framework, we design an innovative sample-level contrastive learning framework that adeptly restructures the constraints of within-sample feature similarity and between-sample feature divergence into a contrastive learning task. 
This strategy enables the student model to capitalize on intra-sample feature congruence while simultaneously enhancing the discrimination of features among disparate samples. Consequently, our LFCC framework accurately captures the commonalities in feature representation across heterogeneous architectures. Extensive evaluations and empirical analyses across three architectures (CNNs, Transformers, and MLPs) demonstrate that LFCC achieves superior performance on the challenging benchmarks of ImageNet-1K and CIFAR-100. All code will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18524v1-abstract-full').style.display = 'none'; document.getElementById('2405.18524v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 3 figures, conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15130">arXiv:2405.15130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.15130">pdf</a>, <a href="https://arxiv.org/format/2405.15130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OptLLM: Optimal Assignment of Queries to Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yueyue Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yuantian Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+V">Van-Hoang Le</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiqiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15130v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have garnered considerable attention owing to their remarkable capabilities, leading to an increasing number of companies offering LLMs as services. Different LLMs achieve different performance at different costs. A challenge for users lies in choosing the LLMs that best fit their needs, balancing cost and performance.
In this paper, we propose a framework for addressi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15130v1-abstract-full').style.display = 'inline'; document.getElementById('2405.15130v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15130v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have garnered considerable attention owing to their remarkable capabilities, leading to an increasing number of companies offering LLMs as services. Different LLMs achieve different performance at different costs. A challenge for users lies in choosing the LLMs that best fit their needs, balancing cost and performance. In this paper, we propose a framework for addressing the cost-effective query allocation problem for LLMs. Given a set of input queries and candidate LLMs, our framework, named OptLLM, provides users with a range of optimal solutions to choose from, aligning with their budget constraints and performance preferences, including options for maximizing accuracy and minimizing cost. OptLLM predicts the performance of candidate LLMs on each query using a multi-label classification model with uncertainty estimation and then iteratively generates a set of non-dominated solutions by destructing and reconstructing the current solution. To evaluate the effectiveness of OptLLM, we conduct extensive experiments on various types of tasks, including text classification, question answering, sentiment analysis, reasoning, and log parsing. Our experimental results demonstrate that OptLLM substantially reduces costs by 2.40% to 49.18% while achieving the same accuracy as the best LLM. Compared to other multi-objective optimization algorithms, OptLLM improves accuracy by 2.94% to 69.05% at the same cost or saves costs by 8.79% and 95.87% while maintaining the highest attainable accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15130v1-abstract-full').style.display = 'none'; document.getElementById('2405.15130v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
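<p class="is-size-7">To make the cost-accuracy trade-off concrete, here is a toy sketch in the spirit of OptLLM (illustrative only, not the paper's algorithm): given predicted accuracy acc[i][j] and cost cost[i][j] of LLM j on query i, start from the cheapest assignment and greedily upgrade the query with the best accuracy gain per extra cost, tracing out a set of non-dominated solutions:</p> <pre><code class="language-python">
def pareto_front(acc, cost):
    # acc[i][j], cost[i][j]: predicted accuracy / cost of LLM j on query i.
    n = len(acc)
    # Start from the cheapest LLM for every query.
    choice = [min(range(len(cost[i])), key=lambda j: cost[i][j]) for i in range(n)]
    front = []
    while True:
        front.append((sum(cost[i][choice[i]] for i in range(n)),
                      sum(acc[i][choice[i]] for i in range(n))))
        # Find the single reassignment with the best accuracy-per-cost gain.
        best, best_ratio = None, 0.0
        for i in range(n):
            for j in range(len(acc[i])):
                gain = acc[i][j] - acc[i][choice[i]]
                extra = cost[i][j] - cost[i][choice[i]]
                if gain > 0 and extra > 0 and gain / extra > best_ratio:
                    best, best_ratio = (i, j), gain / extra
        if best is None:
            return front  # list of (total_cost, total_accuracy) points
        choice[best[0]] = best[1]
</code></pre>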
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by ICWS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01333">arXiv:2405.01333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.01333">pdf</a>, <a href="https://arxiv.org/format/2405.01333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeRF in Robotics: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guangming Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+L">Lei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+S">Songyou Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shaohui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yanzi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wei Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Pollefeys%2C+M">Marc Pollefeys</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hesheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01333v1-abstract-short" style="display: inline;"> Meticulous 3D environment representations have been a longstanding goal in computer vision and robotics fields. The recent emergence of neural implicit representations has introduced radical innovation to this field as implicit representations enable numerous capabilities. Among these, the Neural Radiance Field (NeRF) has sparked a trend because of the huge representational advantages, such as sim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01333v1-abstract-full').style.display = 'inline'; document.getElementById('2405.01333v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01333v1-abstract-full" style="display: none;"> Meticulous 3D environment representations have been a longstanding goal in computer vision and robotics fields. The recent emergence of neural implicit representations has introduced radical innovation to this field as implicit representations enable numerous capabilities. Among these, the Neural Radiance Field (NeRF) has sparked a trend because of the huge representational advantages, such as simplified mathematical models, compact environment storage, and continuous scene representations. Apart from computer vision, NeRF has also shown tremendous potential in the field of robotics. Thus, we create this survey to provide a comprehensive understanding of NeRF in the field of robotics. By exploring the advantages and limitations of NeRF, as well as its current applications and future potential, we hope to shed light on this promising area of research. 
Our survey is divided into two main sections: \textit{The Application of NeRF in Robotics} and \textit{The Advance of NeRF in Robotics}, from the perspective of how NeRF enters the field of robotics. In the first section, we introduce and analyze some works that have been or could be used in the field of robotics from the perception and interaction perspectives. In the second section, we show some works related to improving NeRF&#39;s own properties, which are essential for deploying NeRF in the field of robotics. In the discussion section of the review, we summarize the existing challenges and provide some valuable future research directions for reference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01333v1-abstract-full').style.display = 'none'; document.getElementById('2405.01333v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 19 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19534">arXiv:2404.19534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.19534">pdf</a>, <a href="https://arxiv.org/format/2404.19534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MIPI 2024 Challenge on Nighttime Flare Removal: Methods and Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Y">Yuekun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dafeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaoming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+Z">Zongsheng Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chongyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shangchen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+R">Ruicheng Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+P">Peiqing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zhezhu Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guanqun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Loy%2C+C+C">Chen Change Loy</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lize Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chaoyu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Luyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+G">Guangqi Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaotao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+L">Lei Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Q">Qirui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Q">Qihua Cheng</a>, <a 
href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yihao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+H">Huanjing Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jingyu Yang</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19534v2-abstract-short" style="display: inline;"> The increasing demand for computational photography and imaging on mobile platforms has led to the widespread development and integration of advanced image sensors with novel algorithms in camera systems. However, the scarcity of high-quality data for research and the rare opportunity for in-depth exchange of views from industry and academia constrain the development of mobile intelligent photogra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19534v2-abstract-full').style.display = 'inline'; document.getElementById('2404.19534v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19534v2-abstract-full" style="display: none;"> The increasing demand for computational photography and imaging on mobile platforms has led to the widespread development and integration of advanced image sensors with novel algorithms in camera systems. However, the scarcity of high-quality data for research and the rare opportunity for in-depth exchange of views from industry and academia constrain the development of mobile intelligent photography and imaging (MIPI). Building on the achievements of the previous MIPI Workshops held at ECCV 2022 and CVPR 2023, we introduce our third MIPI challenge including three tracks focusing on novel image sensors and imaging algorithms. In this paper, we summarize and review the Nighttime Flare Removal track on MIPI 2024. In total, 170 participants were successfully registered, and 14 teams submitted results in the final testing phase. The developed solutions in this challenge achieved state-of-the-art performance on Nighttime Flare Removal. More details of this challenge and the link to the dataset can be found at https://mipi-challenge.org/MIPI2024/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19534v2-abstract-full').style.display = 'none'; document.getElementById('2404.19534v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 Mobile Intelligent Photography and Imaging (MIPI) Workshop--Nighttime Flare Removal Challenge Report. 
Website: https://mipi-challenge.org/MIPI2024/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03037">arXiv:2404.03037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03037">pdf</a>, <a href="https://arxiv.org/format/2404.03037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Model-based Reinforcement Learning for Parameterized Action Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Renhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+H">Haotian Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yilin Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Konidaris%2C+G">George Konidaris</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03037v3-abstract-short" style="display: inline;"> We propose a novel model-based reinforcement learning algorithm -- Dynamics Learning and predictive control with Parameterized Actions (DLPA) -- for Parameterized Action Markov Decision Processes (PAMDPs). The agent learns a parameterized-action-conditioned dynamics model and plans with a modified Model Predictive Path Integral control. We theoretically quantify the difference between the generate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03037v3-abstract-full').style.display = 'inline'; document.getElementById('2404.03037v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03037v3-abstract-full" style="display: none;"> We propose a novel model-based reinforcement learning algorithm -- Dynamics Learning and predictive control with Parameterized Actions (DLPA) -- for Parameterized Action Markov Decision Processes (PAMDPs). The agent learns a parameterized-action-conditioned dynamics model and plans with a modified Model Predictive Path Integral control. We theoretically quantify the difference between the generated trajectory and the optimal trajectory during planning in terms of the value they achieve, through the lens of Lipschitz continuity. Our empirical results on several standard benchmarks show that our algorithm achieves sample efficiency and asymptotic performance superior to those of state-of-the-art PAMDP methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03037v3-abstract-full').style.display = 'none'; document.getElementById('2404.03037v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024.
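<p class="is-size-7">The planning loop described above can be caricatured as follows; this is a hedged sketch of MPPI-style planning with parameterized actions (a discrete type plus a continuous argument), where dynamics and reward stand in for the learned models and all names are illustrative:</p> <pre><code class="language-python">
import numpy as np

def plan(state, dynamics, reward, n_types, param_dim,
         horizon=10, samples=256, rng=None):
    rng = rng or np.random.default_rng(0)
    # Sample candidate sequences of (discrete type, continuous parameters).
    types = rng.integers(0, n_types, size=(samples, horizon))
    params = rng.normal(size=(samples, horizon, param_dim))
    returns = np.zeros(samples)
    for s in range(samples):
        x = state
        for t in range(horizon):
            a = (types[s, t], params[s, t])
            returns[s] += reward(x, a)
            x = dynamics(x, a)
    # A full MPPI update would exponentially weight and average the
    # parameter sequences; this sketch just executes the best sample's
    # first action, receding-horizon style.
    best = int(np.argmax(returns))
    return types[best, 0], params[best, 0]
</code></pre>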
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00469">arXiv:2404.00469</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00469">pdf</a>, <a href="https://arxiv.org/format/2404.00469">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yang Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Engelmann%2C+F">Francis Engelmann</a>, <a href="/search/cs?searchtype=author&amp;query=Vysotska%2C+O">Olga Vysotska</a>, <a href="/search/cs?searchtype=author&amp;query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&amp;query=Pollefeys%2C+M">Marc Pollefeys</a>, <a href="/search/cs?searchtype=author&amp;query=Bar%C3%A1th%2C+D+B">D谩niel B茅la Bar谩th</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00469v3-abstract-short" style="display: inline;"> We introduce a novel problem, i.e., the localization of an input image within a multi-modal reference map represented by a database of 3D scene graphs. These graphs comprise multiple modalities, including object-level point clouds, images, attributes, and relationships between objects, offering a lightweight and efficient alternative to conventional methods that rely on extensive image databases.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00469v3-abstract-full').style.display = 'inline'; document.getElementById('2404.00469v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00469v3-abstract-full" style="display: none;"> We introduce a novel problem, i.e., the localization of an input image within a multi-modal reference map represented by a database of 3D scene graphs. These graphs comprise multiple modalities, including object-level point clouds, images, attributes, and relationships between objects, offering a lightweight and efficient alternative to conventional methods that rely on extensive image databases. Given the available modalities, the proposed method SceneGraphLoc learns a fixed-sized embedding for each node (i.e., representing an object instance) in the scene graph, enabling effective matching with the objects visible in the input query image. This strategy significantly outperforms other cross-modal methods, even without incorporating images into the map embeddings. When images are leveraged, SceneGraphLoc achieves performance close to that of state-of-the-art techniques depending on large image databases, while requiring three orders-of-magnitude less storage and operating orders-of-magnitude faster. The code will be made public. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00469v3-abstract-full').style.display = 'none'; document.getElementById('2404.00469v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00312">arXiv:2404.00312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00312">pdf</a>, <a href="https://arxiv.org/format/2404.00312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bayesian Exploration of Pre-trained Models for Low-shot Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Y">Yu Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhijie Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00312v1-abstract-short" style="display: inline;"> Low-shot image classification is a fundamental task in computer vision, and the emergence of large-scale vision-language models such as CLIP has greatly advanced the forefront of research in this field. However, most existing CLIP-based methods lack the flexibility to effectively incorporate other pre-trained models that encompass knowledge distinct from CLIP. To bridge the gap, this work proposes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00312v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00312v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00312v1-abstract-full" style="display: none;"> Low-shot image classification is a fundamental task in computer vision, and the emergence of large-scale vision-language models such as CLIP has greatly advanced the forefront of research in this field. However, most existing CLIP-based methods lack the flexibility to effectively incorporate other pre-trained models that encompass knowledge distinct from CLIP. To bridge the gap, this work proposes a simple and effective probabilistic model ensemble framework based on Gaussian processes, which have previously demonstrated remarkable efficacy in processing small data. We achieve the integration of prior knowledge by specifying the mean function with CLIP and the kernel function with an ensemble of deep kernels built upon various pre-trained models. By regressing the classification label directly, our framework enables analytical inference, straightforward uncertainty quantification, and principled hyper-parameter tuning. 
Through extensive experiments on standard benchmarks, we demonstrate that our method consistently outperforms competitive ensemble baselines regarding predictive performance. Additionally, we assess the robustness of our method and the quality of the yielded uncertainty estimates on out-of-distribution datasets. We also illustrate that our method, despite relying on label regression, still enjoys superior model calibration compared to most deterministic baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00312v1-abstract-full').style.display = 'none'; document.getElementById('2404.00312v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12760">arXiv:2403.12760</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12760">pdf</a>, <a href="https://arxiv.org/format/2403.12760">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WaveFace: Authentic Face Restoration with Efficient Frequency Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yunqi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+J">Jiankang Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jungong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12760v1-abstract-short" style="display: inline;"> Although diffusion models are emerging as a powerful solution for blind face restoration, they are criticized for two problems: 1) slow training and inference speed, and 2) failure in preserving identity and recovering fine-grained facial details. In this work, we propose WaveFace to solve the problems in the frequency domain, where low- and high-frequency components decomposed by wavelet transforma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12760v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12760v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12760v1-abstract-full" style="display: none;"> Although diffusion models are emerging as a powerful solution for blind face restoration, they are criticized for two problems: 1) slow training and inference speed, and 2) failure in preserving identity and recovering fine-grained facial details. In this work, we propose WaveFace to solve the problems in the frequency domain, where low- and high-frequency components decomposed by wavelet transformation are considered individually to maximize authenticity as well as efficiency. The diffusion model is applied to recover the low-frequency component only, which carries the general information of the original image at 1/16 of the original size.
To preserve the original identity, the generation is conditioned on the low-frequency component of low-quality images at each denoising step. Meanwhile, high-frequency components at multiple decomposition levels are handled by a unified network, which recovers complex facial details in a single step. Evaluations on four benchmark datasets show that: 1) WaveFace outperforms state-of-the-art methods in authenticity, especially in terms of identity preservation, and 2) authentic images are restored 10x faster than with existing diffusion-model-based BFR methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12760v1-abstract-full').style.display = 'none'; document.getElementById('2403.12760v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05530">arXiv:2403.05530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.05530">pdf</a>, <a href="https://arxiv.org/format/2403.05530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&amp;query=Georgiev%2C+P">Petko Georgiev</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+V+I">Ving Ian Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Burnell%2C+R">Ryan Burnell</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Libin Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Gulati%2C+A">Anmol Gulati</a>, <a href="/search/cs?searchtype=author&amp;query=Tanzer%2C+G">Garrett Tanzer</a>, <a href="/search/cs?searchtype=author&amp;query=Vincent%2C+D">Damien Vincent</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zhufeng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shibo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Mariooryad%2C+S">Soroosh Mariooryad</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+X">Xinyang Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Alcober%2C+F">Fred Alcober</a>, <a href="/search/cs?searchtype=author&amp;query=Frostig%2C+R">Roy Frostig</a>, <a href="/search/cs?searchtype=author&amp;query=Omernick%2C+M">Mark Omernick</a>, <a href="/search/cs?searchtype=author&amp;query=Walker%2C+L">Lexi Walker</a>, <a href="/search/cs?searchtype=author&amp;query=Paduraru%2C+C">Cosmin Paduraru</a>, <a href="/search/cs?searchtype=author&amp;query=Sorokin%2C+C">Christina Sorokin</a>, <a href="/search/cs?searchtype=author&amp;query=Tacchetti%2C+A">Andrea Tacchetti</a>, <a href="/search/cs?searchtype=author&amp;query=Gaffney%2C+C">Colin Gaffney</a>, <a href="/search/cs?searchtype=author&amp;query=Daruki%2C+S">Samira
Daruki</a>, <a href="/search/cs?searchtype=author&amp;query=Sercinoglu%2C+O">Olcan Sercinoglu</a>, <a href="/search/cs?searchtype=author&amp;query=Gleicher%2C+Z">Zach Gleicher</a>, <a href="/search/cs?searchtype=author&amp;query=Love%2C+J">Juliette Love</a> , et al. (1110 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05530v4-abstract-short" style="display: inline;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v4-abstract-full').style.display = 'inline'; document.getElementById('2403.05530v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05530v4-abstract-full" style="display: none;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant designed for efficiency with minimal regression in quality. Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ultra&#39;s state-of-the-art performance across a broad set of benchmarks. Studying the limits of Gemini 1.5&#39;s long-context ability, we find continued improvement in next-token prediction and near-perfect retrieval (&gt;99%) up to at least 10M tokens, a generational leap over existing models such as Claude 3.0 (200k) and GPT-4 Turbo (128k). Finally, we highlight real-world use cases, such as Gemini 1.5 collaborating with professionals on completing their tasks, achieving 26 to 75% time savings across 10 different job categories, as well as surprising new capabilities of large language models at the frontier; when given a grammar manual for Kalamang, a language with fewer than 200 speakers worldwide, the model learns to translate English to Kalamang at a similar level to a person who learned from the same content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v4-abstract-full').style.display = 'none'; document.getElementById('2403.05530v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04164">arXiv:2403.04164</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.04164">pdf</a>, <a href="https://arxiv.org/format/2403.04164">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ProMISe: Promptable Medical Image Segmentation using SAM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jinfeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Sifan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinkun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yiyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yiyi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jionglong Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.04164v3-abstract-short" style="display: inline;"> With the proposal of the Segment Anything Model (SAM), fine-tuning SAM for medical image segmentation (MIS) has become popular. However, due to the large size of the SAM model and the significant domain gap between natural and medical images, fine-tuning-based strategies are costly with potential risk of instability, feature damage and catastrophic forgetting. Furthermore, some methods of transfer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04164v3-abstract-full').style.display = 'inline'; document.getElementById('2403.04164v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04164v3-abstract-full" style="display: none;"> With the proposal of the Segment Anything Model (SAM), fine-tuning SAM for medical image segmentation (MIS) has become popular. However, due to the large size of the SAM model and the significant domain gap between natural and medical images, fine-tuning-based strategies are costly with potential risk of instability, feature damage and catastrophic forgetting. Furthermore, some methods of transferring SAM to a domain-specific MIS through fine-tuning strategies disable the model&#39;s prompting capability, severely limiting its utilization scenarios. In this paper, we propose an Auto-Prompting Module (APM), which provides SAM-based foundation model with Euclidean adaptive prompts in the target domain. Our experiments demonstrate that such adaptive prompts significantly improve SAM&#39;s non-fine-tuned performance in MIS. In addition, we propose a novel non-invasive method called Incremental Pattern Shifting (IPS) to adapt SAM to specific medical domains. Experimental results show that the IPS enables SAM to achieve state-of-the-art or competitive performance in MIS without the need for fine-tuning. By coupling these two methods, we propose ProMISe, an end-to-end non-fine-tuned framework for Promptable Medical Image Segmentation. 
Our experiments demonstrate that using our methods, either individually or in combination, achieves satisfactory performance in low-cost pattern shifting, with all of SAM&#39;s parameters frozen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04164v3-abstract-full').style.display = 'none'; document.getElementById('2403.04164v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.02558">arXiv:2403.02558</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.02558">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Minimum Information about CLinical Artificial Intelligence Checklist for Generative Modeling Research (MI-CLAIM-GEN) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+B+Y">Brenda Y. Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+I+Y">Irene Y. Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Williams%2C+C+Y">Christopher YK Williams</a>, <a href="/search/cs?searchtype=author&amp;query=Davidson%2C+J">Jaysón Davidson</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia-Agundez%2C+A">Augusto Garcia-Agundez</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shenghuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zack%2C+T">Travis Zack</a>, <a href="/search/cs?searchtype=author&amp;query=Saria%2C+S">Suchi Saria</a>, <a href="/search/cs?searchtype=author&amp;query=Arnaout%2C+R">Rima Arnaout</a>, <a href="/search/cs?searchtype=author&amp;query=Quer%2C+G">Giorgio Quer</a>, <a href="/search/cs?searchtype=author&amp;query=Sadaei%2C+H+J">Hossein J. Sadaei</a>, <a href="/search/cs?searchtype=author&amp;query=Torkamani%2C+A">Ali Torkamani</a>, <a href="/search/cs?searchtype=author&amp;query=Beaulieu-Jones%2C+B">Brett Beaulieu-Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Gianfrancesco%2C+M">Milena Gianfrancesco</a>, <a href="/search/cs?searchtype=author&amp;query=Butte%2C+A+J">Atul J. Butte</a>, <a href="/search/cs?searchtype=author&amp;query=Norgeot%2C+B">Beau Norgeot</a>, <a href="/search/cs?searchtype=author&amp;query=Sushil%2C+M">Madhumita Sushil</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.02558v2-abstract-short" style="display: inline;"> Recent advances in generative models, including large language models (LLMs), vision language models (VLMs), and diffusion models, have accelerated the field of natural language and image processing in medicine and marked a significant paradigm shift in how biomedical models can be developed and deployed.
While these models are highly adaptable to new tasks, scaling and evaluating their usage pres&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02558v2-abstract-full').style.display = 'inline'; document.getElementById('2403.02558v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.02558v2-abstract-full" style="display: none;"> Recent advances in generative models, including large language models (LLMs), vision language models (VLMs), and diffusion models, have accelerated the field of natural language and image processing in medicine and marked a significant paradigm shift in how biomedical models can be developed and deployed. While these models are highly adaptable to new tasks, scaling and evaluating their usage presents new challenges not addressed in previous frameworks. In particular, the ability of these models to produce useful outputs with little to no specialized training data (&#34;zero-&#34; or &#34;few-shot&#34; approaches), as well as the open-ended nature of their outputs, necessitate the development of new guidelines for robust reporting of clinical generative model research. In response to gaps in standards and best practices for the development of clinical AI tools identified by US Executive Order 14110 and several emerging national networks for clinical AI evaluation, we begin to formalize some of these guidelines by building on the original MI-CLAIM checklist. The new checklist, MI-CLAIM-GEN (Table 1), aims to address differences in training, evaluation, interpretability, and reproducibility of new generative models compared to non-generative (&#34;predictive&#34;) AI models. This MI-CLAIM-GEN checklist also seeks to clarify cohort selection reporting with unstructured clinical data and adds additional items on alignment with ethical standards for clinical AI research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02558v2-abstract-full').style.display = 'none'; document.getElementById('2403.02558v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.15813">arXiv:2402.15813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.15813">pdf</a>, <a href="https://arxiv.org/format/2402.15813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Measuring Bargaining Abilities of LLMs: A Benchmark and A Buyer-Enhancement Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+T">Tian Xia</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zhiwei He</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+T">Tong Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuosheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.15813v3-abstract-short" style="display: inline;"> Bargaining is an important and unique part of negotiation between humans. As LLM-driven agents learn to negotiate and act like real humans, how to evaluate agents&#39; bargaining abilities remains an open problem. For the first time, we formally described the Bargaining task as an asymmetric incomplete information game, defining the gains of the Buyer and Seller in multiple bargaining processes. It al&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15813v3-abstract-full').style.display = 'inline'; document.getElementById('2402.15813v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.15813v3-abstract-full" style="display: none;"> Bargaining is an important and unique part of negotiation between humans. As LLM-driven agents learn to negotiate and act like real humans, how to evaluate agents&#39; bargaining abilities remains an open problem. For the first time, we formally described the Bargaining task as an asymmetric incomplete information game, defining the gains of the Buyer and Seller in multiple bargaining processes. It allows us to quantitatively assess an agent&#39;s performance in the Bargain task. We collected a real product price dataset, AmazonHistoryPrice, and conducted evaluations of various LLM agents&#39; bargaining abilities. We find that playing a Buyer is much harder than a Seller, and increasing model size can not effectively improve the Buyer&#39;s performance. To address the challenge, we propose a novel approach called OG-Narrator that integrates a deterministic Offer Generator to control the price range of Buyer&#39;s offers, and an LLM Narrator to create natural language sentences for generated offers. Experimental results show that OG-Narrator improves the buyer&#39;s deal rates from 26.67% to 88.88% and brings a ten times multiplication of profits on all baselines, even a model that has not been aligned. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15813v3-abstract-full').style.display = 'none'; document.getElementById('2402.15813v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2024 Findings. The dataset AmazonHistoryPrice and our code are available at https://github.com/TianXiaSJTU/AmazonPriceHistory</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09345">arXiv:2402.09345</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09345">pdf</a>, <a href="https://arxiv.org/format/2402.09345">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> InfoRM: Mitigating Reward Hacking in RLHF via Information-Theoretic Reward Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yuchun Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Sen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+L">Liang Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+R">Rong Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lefei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09345v5-abstract-short" style="display: inline;"> Despite the success of reinforcement learning from human feedback (RLHF) in aligning language models with human values, reward hacking, also termed reward overoptimization, remains a critical challenge. This issue primarily arises from reward misgeneralization, where reward models (RMs) compute reward using spurious features that are irrelevant to human preferences. In this work, we tackle this pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09345v5-abstract-full').style.display = 'inline'; document.getElementById('2402.09345v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09345v5-abstract-full" style="display: none;"> Despite the success of reinforcement learning from human feedback (RLHF) in aligning language models with human values, reward hacking, also termed reward overoptimization, remains a critical challenge. This issue primarily arises from reward misgeneralization, where reward models (RMs) compute reward using spurious features that are irrelevant to human preferences. 
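The buyer-enhancement idea above is concrete enough to sketch. Below is a minimal illustration, assuming a simple geometric concession schedule and a templated narrator standing in for the paper's LLM Narrator; the schedule form, function names, and parameters are illustrative, not the authors' exact design.

```python
# Minimal OG-Narrator-style buyer: a deterministic generator bounds the
# offer, and a (here templated) narrator turns it into an utterance.

def offer_generator(list_price: float, floor_ratio: float = 0.5,
                    growth: float = 0.1, round_idx: int = 0) -> float:
    """Deterministically propose an offer between a floor and the list price,
    conceding upward each round (assumed schedule, not the paper's)."""
    floor = list_price * floor_ratio
    return min(list_price, floor * (1 + growth) ** round_idx)

def narrate(offer: float, product: str) -> str:
    """Stand-in for the LLM Narrator: in the paper an LLM generates this
    sentence for the controlled offer; here it is a fixed template."""
    return f"I like this {product}, but my budget is tight. Would you take ${offer:.2f}?"

for t in range(3):
    print(narrate(offer_generator(100.0, round_idx=t), "camera"))
```

The key design point survives even in this toy: the price trajectory is fixed by the generator, so the language model can never concede more than the schedule allows.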
arXiv:2402.09345 [pdf, other] cs.LG, cs.AI
InfoRM: Mitigating Reward Hacking in RLHF via Information-Theoretic Reward Modeling
Authors: Yuchun Miao, Sen Zhang, Liang Ding, Rong Bao, Lefei Zhang, Dacheng Tao
Abstract: Despite the success of reinforcement learning from human feedback (RLHF) in aligning language models with human values, reward hacking, also termed reward overoptimization, remains a critical challenge. This issue primarily arises from reward misgeneralization, where reward models (RMs) compute reward using spurious features that are irrelevant to human preferences. In this work, we tackle this problem from an information-theoretic perspective and propose a framework for reward modeling, namely InfoRM, by introducing a variational information bottleneck objective to filter out irrelevant information. Notably, we further identify a correlation between overoptimization and outliers in the IB latent space of InfoRM, establishing it as a promising tool for detecting reward overoptimization. Inspired by this finding, we propose the Cluster Separation Index (CSI), which quantifies deviations in the IB latent space, as an indicator of reward overoptimization to facilitate the development of online mitigation strategies. Extensive experiments on a wide range of settings and RM scales (70M, 440M, 1.4B, and 7B) demonstrate the effectiveness of InfoRM. Further analyses reveal that InfoRM's overoptimization detection mechanism is not only effective but also robust across a broad range of datasets, signifying a notable advancement in the field of RLHF. The code will be released upon acceptance.
Submitted 1 November, 2024; v1 submitted 14 February, 2024; originally announced February 2024.
Comments: Accepted by NeurIPS 2024
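To make the variational information bottleneck objective concrete, here is a hedged PyTorch sketch of an IB-style reward head: a stochastic latent sits between the language model's hidden state and the scalar reward, trained with a Bradley-Terry preference loss plus a KL penalty that discards preference-irrelevant information. Layer sizes, the beta weight, and the module layout are assumptions, not InfoRM's released code.

```python
import torch, torch.nn as nn, torch.nn.functional as F

class IBRewardHead(nn.Module):
    """Sketch of an information-bottleneck reward head (hypothetical sizes):
    hidden state -> stochastic latent z -> scalar reward."""
    def __init__(self, hidden=768, latent=64):
        super().__init__()
        self.mu = nn.Linear(hidden, latent)
        self.logvar = nn.Linear(hidden, latent)
        self.reward = nn.Linear(latent, 1)

    def forward(self, h):
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterize
        # KL(N(mu, sigma^2) || N(0, 1)), the bottleneck penalty
        kl = 0.5 * (mu.pow(2) + logvar.exp() - 1 - logvar).sum(-1).mean()
        return self.reward(z).squeeze(-1), kl

def ib_preference_loss(head, h_chosen, h_rejected, beta=1e-3):
    r_c, kl_c = head(h_chosen)
    r_r, kl_r = head(h_rejected)
    bt = -F.logsigmoid(r_c - r_r).mean()  # Bradley-Terry preference loss
    return bt + beta * (kl_c + kl_r)      # plus the variational IB penalty
```

Outlier detection in the latent space (the basis of CSI) would then operate on the `mu` vectors of RLHF-generated samples.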
arXiv:2402.05821 [pdf, other] cs.LG, cs.NE
Guided Evolution with Binary Discriminators for ML Program Search
Authors: John D. Co-Reyes, Yingjie Miao, George Tucker, Aleksandra Faust, Esteban Real
Abstract: How to automatically design better machine learning programs is an open problem within AutoML. While evolution has been a popular tool to search for better ML programs, using learning itself to guide the search has been less successful and less understood on harder problems, but it has the promise to dramatically increase the speed and final performance of the optimization process. We propose guiding evolution with a binary discriminator, trained online to distinguish which of a pair of programs is better. The discriminator selects better programs without having to perform a costly evaluation and thus speeds up the convergence of evolution. Our method can encode a wide variety of ML components, including symbolic optimizers, neural architectures, RL loss functions, and symbolic regression equations, with the same directed acyclic graph representation. By combining this representation with modern GNNs and an adaptive mutation strategy, we demonstrate that our method can speed up evolution across a set of diverse problems, including a 3.7x speedup on the symbolic search for ML optimizers and a 4x speedup for RL loss functions.
Submitted 8 February, 2024; originally announced February 2024.
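A toy version of the discriminator-guided loop, with every name illustrative: a regularized-evolution tournament in which a cheap binary discriminator screens each pair of mutated children, so only the winner pays for a costly evaluation.

```python
import random

def guided_evolution(population, mutate, discriminator, evaluate, steps=100):
    """Toy regularized-evolution loop with a binary discriminator (all names
    are stand-ins). `population` holds (program, fitness) pairs;
    `discriminator(a, b)` returns True if program `a` looks better than `b`.
    Only the screened winner pays for a costly `evaluate` call."""
    for _ in range(steps):
        # Tournament selection over stored fitness values.
        parent, _ = max(random.sample(population, 3), key=lambda pf: pf[1])
        child_a, child_b = mutate(parent), mutate(parent)
        # One cheap discriminator call replaces one costly evaluation.
        child = child_a if discriminator(child_a, child_b) else child_b
        population.append((child, evaluate(child)))
        population.pop(0)  # age out the oldest individual
    return max(population, key=lambda pf: pf[1])
```

The saving compounds because a discriminator forward pass is orders of magnitude cheaper than training and evaluating a candidate ML program.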
arXiv:2402.03597 [pdf] cs.CL, cs.IR, cs.LG
Identifying Reasons for Contraceptive Switching from Real-World Data Using Large Language Models
Authors: Brenda Y. Miao, Christopher YK Williams, Ebenezer Chinedu-Eneh, Travis Zack, Emily Alsentzer, Atul J. Butte, Irene Y. Chen
Abstract: Prescription contraceptives play a critical role in supporting women's reproductive health. With nearly 50 million women in the United States using contraceptives, understanding the factors that drive contraceptive selection and switching is of significant interest. However, many factors related to medication switching are often only captured in unstructured clinical notes and can be difficult to extract. Here, we evaluate the zero-shot abilities of a recently developed large language model, GPT-4 (via a HIPAA-compliant Microsoft Azure API), to identify reasons for switching between classes of contraceptives from the UCSF Information Commons clinical notes dataset. We demonstrate that GPT-4 can accurately extract reasons for contraceptive switching, outperforming baseline BERT-based models with microF1 scores of 0.849 and 0.881 for contraceptive start and stop extraction, respectively. Human evaluation of GPT-4-extracted reasons for switching showed 91.4% accuracy, with minimal hallucinations. Using the extracted reasons, we identified patient preference, adverse events, and insurance as key reasons for switching via unsupervised topic modeling approaches. Notably, we also showed using our approach that "weight gain/mood change" and "insurance coverage" are disproportionately found as reasons for contraceptive switching in specific demographic populations. Our code and supplemental data are available at https://github.com/BMiao10/contraceptive-switching.
Submitted 5 February, 2024; originally announced February 2024.
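A hedged sketch of the zero-shot extraction step. The study ran GPT-4 through a HIPAA-compliant Microsoft Azure endpoint; the client, model name, and prompt below are placeholders for illustration only, and real patient notes must never be sent outside a compliant environment.

```python
# Zero-shot reason extraction from a clinical note (illustrative setup).
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY; swap in AzureOpenAI for PHI

PROMPT = (
    "From the clinical note below, list any stated reasons the patient "
    "started or stopped a contraceptive. Answer as 'start: ...' / "
    "'stop: ...' or 'none'.\n\nNote:\n{note}"
)

def extract_switch_reasons(note: str) -> str:
    resp = client.chat.completions.create(
        model="gpt-4",  # placeholder model name
        messages=[{"role": "user", "content": PROMPT.format(note=note)}],
        temperature=0,  # deterministic output for extraction
    )
    return resp.choices[0].message.content
```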
arXiv:2401.05568 [pdf, other] cond-mat.mtrl-sci, cs.LG, physics.comp-ph
Phase discovery with active learning: Application to structural phase transitions in equiatomic NiTi
Authors: Jonathan Vandermause, Anders Johansson, Yucong Miao, Joost J. Vlassak, Boris Kozinsky
Abstract: Nickel titanium (NiTi) is a prototypical shape-memory alloy used in a range of biomedical and engineering devices, but direct molecular dynamics simulations of the martensitic B19' -> B2 phase transition driving its shape-memory behavior are rare and have relied on classical force fields with limited accuracy. Here, we train four machine-learned force fields for equiatomic NiTi based on the LDA, PBE, PBEsol, and SCAN DFT functionals. The models are trained on the fly during NPT molecular dynamics, with DFT calculations and model updates performed automatically whenever the uncertainty of a local energy prediction exceeds a chosen threshold. The models achieve accuracies of 1-2 meV/atom during training and are shown to closely track DFT predictions of B2 and B19' elastic constants and phonon frequencies. Surprisingly, in large-scale molecular dynamics simulations, only the SCAN model predicts a reversible B19' -> B2 phase transition, with the LDA, PBE, and PBEsol models predicting a reversible transition to a previously uncharacterized low-volume phase, which we hypothesize to be a new stable high-pressure phase. We examine the structure of the new phase and estimate its stability on the temperature-pressure phase diagram. This work establishes an automated active learning protocol for studying displacive transformations, reveals important differences between DFT functionals that can only be detected in large-scale simulations, provides an accurate force field for NiTi, and identifies a new phase.
Submitted 10 January, 2024; originally announced January 2024.
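The on-the-fly training protocol reduces to a short control loop, sketched schematically below. `model`, `dft_calc`, and `md_step` are duck-typed stand-ins for a surrogate force field with per-prediction uncertainty, a DFT caller, and an MD integrator; none of this is the authors' code.

```python
def active_learning_md(atoms, model, dft_calc, threshold, n_steps, md_step):
    """Schematic on-the-fly active-learning loop: run MD with the surrogate
    force field and fall back to DFT whenever the model's local-energy
    uncertainty exceeds a chosen threshold (all objects are stand-ins)."""
    training_set = []
    for _ in range(n_steps):
        forces, uncertainty = model.predict(atoms)   # surrogate + error bar
        if uncertainty > threshold:
            forces, energy = dft_calc(atoms)         # costly ground truth
            training_set.append((atoms.copy(), forces, energy))
            model.update(training_set)               # retrain on the fly
        atoms = md_step(atoms, forces)               # advance NPT dynamics
    return model
```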
arXiv:2401.00434 [pdf, other] cs.CL
GeoGalactica: A Scientific Large Language Model in Geoscience
Authors: Zhouhan Lin, Cheng Deng, Le Zhou, Tianhang Zhang, Yi Xu, Yutong Xu, Zhongmou He, Yuanyuan Shi, Beiya Dai, Yunchong Song, Boyi Zeng, Qiyuan Chen, Yuxun Miao, Bo Xue, Shu Wang, Luoyi Fu, Weinan Zhang, Junxian He, Yunqiang Zhu, Xinbing Wang, Chenghu Zhou
Abstract: Large language models (LLMs) have achieved huge success for their general knowledge and ability to solve a wide spectrum of tasks in natural language processing (NLP). Due to their impressive abilities, LLMs have shed light on potential interdisciplinary applications that foster scientific discovery in specific domains using artificial intelligence (AI for science, AI4S). In the meantime, the use of NLP techniques in geoscience research and practice is broad and varied, spanning knowledge extraction, document classification, question answering, and knowledge discovery. In this work, we take the initial step of leveraging an LLM for science through a rather straightforward approach. We specialize an LLM for geoscience by further pre-training the model on a vast amount of geoscience text, and by supervised fine-tuning (SFT) the resulting model with a custom-collected instruction tuning dataset. These efforts result in GeoGalactica, a model consisting of 30 billion parameters; to the best of our knowledge, it is the largest language model for the geoscience domain. More specifically, GeoGalactica is obtained by further pre-training Galactica on a geoscience-related text corpus containing 65 billion tokens, which is, to our knowledge, the largest geoscience-specific text corpus. We then fine-tune the model with 1 million pairs of instruction-tuning data consisting of questions that demand professional geoscience knowledge to answer. In this technical report, we illustrate in detail all aspects of GeoGalactica, including data collection, data cleaning, base model selection, pre-training, SFT, and evaluation. We open-source our data curation tools and the checkpoints of GeoGalactica from the first 3/4 of pre-training.
Submitted 13 April, 2024; v1 submitted 31 December, 2023; originally announced January 2024.
ACM Class: I.2.7; F.4.1
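For readers who want the shape of the recipe, here is a heavily hedged single-GPU sketch of domain-adaptive further pre-training with Hugging Face Transformers. The checkpoint (a small Galactica stand-in), the corpus file name, and all hyperparameters are placeholders; the actual GeoGalactica run used a 30B model on large-scale distributed infrastructure.

```python
# Continued (domain-adaptive) pre-training sketch; everything is a stand-in.
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)
from datasets import load_dataset

tok = AutoTokenizer.from_pretrained("facebook/galactica-125m")  # small stand-in
model = AutoModelForCausalLM.from_pretrained("facebook/galactica-125m")
if tok.pad_token is None:                      # guard: collator needs padding
    tok.add_special_tokens({"pad_token": "<pad>"})
    model.resize_token_embeddings(len(tok))

corpus = load_dataset("text", data_files={"train": "geoscience_corpus.txt"})
def tokenize(batch):
    return tok(batch["text"], truncation=True, max_length=1024)
train = corpus["train"].map(tokenize, batched=True, remove_columns=["text"])

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="geo-ckpt", per_device_train_batch_size=2),
    train_dataset=train,
    data_collator=DataCollatorForLanguageModeling(tok, mlm=False),  # causal LM
)
trainer.train()
```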
arXiv:2312.12803 [pdf, other] cs.IT
Repairing Schemes for Tamo-Barg Codes
Authors: Han Cai, Ying Miao, Moshe Schwartz, Xiaohu Tang
Abstract: In this paper, the repair problem for erasures beyond locality in locally repairable codes is explored under a practical system setting, where a rack-aware storage system consists of racks, each containing a few parity checks. This is referred to as a rack-aware system with locality. Two repair schemes are devised to reduce the repair bandwidth for Tamo-Barg codes under the rack-aware model by setting each repair set as a rack. Additionally, a cut-set bound for locally repairable codes under the rack-aware model with locality is introduced. Using this bound, the second repair scheme is proven to be optimal. Furthermore, the partial-repair problem is considered for locally repairable codes under the rack-aware model with locality, and both repair schemes and bounds are introduced for this scenario.
Submitted 26 July, 2024; v1 submitted 20 December, 2023; originally announced December 2023.

arXiv:2312.11805 [pdf, other] cs.CL, cs.AI, cs.CV
Gemini: A Family of Highly Capable Multimodal Models
Authors: Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M. Dai, Anja Hauth, Katie Millican, David Silver, Melvin Johnson, Ioannis Antonoglou, Julian Schrittwieser, Amelia Glaese, Jilin Chen, Emily Pitler, Timothy Lillicrap, Angeliki Lazaridou, Orhan Firat, James Molloy, Michael Isard, Paul R. Barham, Tom Hennigan, Benjamin Lee, et al. (1325 additional authors not shown)
Abstract: This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultra model advances the state of the art in 30 of 32 of these benchmarks, notably being the first model to achieve human-expert performance on the well-studied exam benchmark MMLU, and improving the state of the art in every one of the 20 multimodal benchmarks we examined. We believe that the new capabilities of the Gemini family in cross-modal reasoning and language understanding will enable a wide variety of use cases. We discuss our approach toward post-training and deploying Gemini models responsibly to users through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud Vertex AI.
Submitted 17 June, 2024; v1 submitted 18 December, 2023; originally announced December 2023.
arXiv:2312.00413 [pdf, other] cs.SE, cs.AI, cs.CL, cs.PL
Abstract Syntax Tree for Programming Language Understanding and Representation: How Far Are We?
Authors: Weisong Sun, Chunrong Fang, Yun Miao, Yudu You, Mengzhe Yuan, Yuchen Chen, Quanjun Zhang, An Guo, Xiang Chen, Yang Liu, Zhenyu Chen
Abstract: Programming language understanding and representation (a.k.a. code representation learning) has always been a hot and challenging task in software engineering. It aims to apply deep learning techniques to produce numerical representations of source code features while preserving their semantics. These representations can be used to facilitate subsequent code-related tasks. The abstract syntax tree (AST), a fundamental code feature, illustrates the syntactic information of the source code and has been widely used in code representation learning. However, there is still a lack of systematic and quantitative evaluation of how well AST-based code representation facilitates subsequent code-related tasks. In this paper, we first conduct a comprehensive empirical study to explore the effectiveness of AST-based code representation in facilitating follow-up code-related tasks. To do so, we compare the performance of models trained with code token sequence (Token for short) based code representation and AST-based code representation on three popular types of code-related tasks. Surprisingly, the overall quantitative statistical results demonstrate that models trained with AST-based code representation consistently perform worse across all three tasks compared to models trained with Token-based code representation. Our further quantitative analysis reveals that models trained with AST-based code representation outperform models trained with Token-based code representation on certain subsets of samples across all three tasks. We also conduct comprehensive experiments to evaluate and reveal the impact of the choice of AST parsing/preprocessing/encoding methods on AST-based code representation and subsequent code-related tasks. Our study provides future researchers with detailed guidance on how to select solutions at each stage to fully exploit AST.
Submitted 1 December, 2023; originally announced December 2023.
Comments: Submitted to ACM Transactions on Software Engineering and Methodology. arXiv admin note: text overlap with arXiv:2103.10668 by other authors.
MSC Class: 68-04; 68T30. ACM Class: D.2.3; I.2.2; I.2.4
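The two code views being compared are easy to reproduce with Python's standard library: a flat token sequence versus a linearized sequence of AST node types. This is only a schematic of the input representations, not the study's models or its exact preprocessing.

```python
# Token view vs. AST view of the same snippet, via the stdlib only.
import ast, io, tokenize

src = "def add(a, b):\n    return a + b\n"

tokens = [t.string for t in tokenize.generate_tokens(io.StringIO(src).readline)
          if t.string.strip()]                 # drop NEWLINE/INDENT/ENDMARKER
ast_nodes = [type(n).__name__ for n in ast.walk(ast.parse(src))]

print("Token view:", tokens)     # ['def', 'add', '(', 'a', ',', 'b', ...]
print("AST view:  ", ast_nodes)  # ['Module', 'FunctionDef', 'arguments', ...]
```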
arXiv:2311.15269 [pdf, other] cs.DC, cs.AI
Tessel: Boosting Distributed Execution of Large DNN Models via Flexible Schedule Search
Authors: Zhiqi Lin, Youshan Miao, Guanbin Xu, Cheng Li, Olli Saarikivi, Saeed Maleki, Fan Yang
Abstract: Increasingly complex and diverse deep neural network (DNN) models necessitate distributing execution across multiple devices for training and inference tasks, and also require carefully planned schedules for performance. However, existing practices often rely on predefined schedules that may not fully exploit the benefits of emerging diverse model-aware operator placement strategies. Handcrafting high-efficiency schedules can be challenging due to the large and varying schedule space. This paper presents Tessel, an automated system that searches for efficient schedules for distributed DNN training and inference under diverse operator placement strategies. To reduce search costs, Tessel leverages the insight that the most efficient schedules often exhibit a repetitive pattern (repetend) across different data inputs. This leads to a two-phase approach: repetend construction and schedule completion. By exploring schedules for various operator placement strategies, Tessel significantly improves both training and inference performance. Experiments with representative DNN models demonstrate that Tessel achieves up to 5.5x training performance speedup and up to 38% inference latency reduction.
Submitted 26 November, 2023; originally announced November 2023.
Comments: Accepted by HPCA 2024
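The repetend insight can be illustrated in a few lines: given a flat schedule of (device, op) steps, find the shortest prefix whose tiling reproduces the whole schedule. Tessel's real search constructs repetends rather than detecting them; this toy only shows the structural assumption.

```python
def find_repetend(schedule):
    """Return the shortest prefix of `schedule` whose repetition reproduces
    the whole schedule, i.e. the repetend of a steady-state pipeline."""
    n = len(schedule)
    for p in range(1, n + 1):
        if n % p == 0 and schedule == schedule[:p] * (n // p):
            return schedule[:p]
    return schedule  # no shorter repetend exists

steps = [("gpu0", "fwd"), ("gpu1", "fwd"), ("gpu0", "bwd"), ("gpu1", "bwd")] * 3
print(find_repetend(steps))  # the 4-step pattern repeated three times
```

Searching over one repetend instead of the full unrolled schedule is what shrinks the search space in the two-phase approach.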
arXiv:2311.12592 [pdf, other] cs.HC, cs.AI, eess.SY
Visual tracking brain computer interface
Authors: Changxing Huang, Nanlin Shi, Yining Miao, Xiaogang Chen, Yijun Wang, Xiaorong Gao
Abstract: Brain-computer interfaces (BCIs) offer a way to interact with computers without relying on physical movements. Non-invasive electroencephalography (EEG)-based visual BCIs, known for efficient speed and ease of calibration, face limitations in continuous tasks due to discrete stimulus design and decoding methods. To achieve continuous control, we implemented a novel spatial encoding stimulus paradigm and devised a corresponding projection method to enable continuous modulation of decoded velocity. Subsequently, we conducted experiments involving 17 participants and achieved a Fitts' ITR of 0.55 bps for the fixed tracking task and 0.37 bps for the random tracking task. The proposed BCI, with its high Fitts' ITR, was then integrated into two applications, including painting and gaming. In conclusion, this study proposed a visual BCI-based control method that goes beyond discrete commands, allowing natural continuous control based on neural activity.
Submitted 21 November, 2023; originally announced November 2023.

arXiv:2311.11596 [pdf] cs.HC, cs.IT, eess.SP, q-bio.NC
High-performance cVEP-BCI under minimal calibration
Authors: Yining Miao, Nanlin Shi, Changxing Huang, Yonghao Song, Xiaogang Chen, Yijun Wang, Xiaorong Gao
Abstract: The ultimate goal of brain-computer interfaces (BCIs) based on visual modulation paradigms is to achieve high-speed performance without the burden of extensive calibration. Code-modulated visual evoked potential-based BCIs (cVEP-BCIs) modulated by broadband white noise (WN) offer various advantages, including increased communication speed, expanded encoding target capabilities, and enhanced coding flexibility. However, the complexity of the spatial-temporal patterns under broadband stimuli necessitates extensive calibration for effective target identification in cVEP-BCIs. Consequently, the information transfer rate (ITR) of cVEP-BCIs under limited calibration usually stays around 100 bits per minute (bpm), significantly lagging behind state-of-the-art steady-state visual evoked potential-based BCIs (SSVEP-BCIs), which achieve rates above 200 bpm. To enhance the performance of cVEP-BCIs with minimal calibration, we devised an efficient calibration stage involving brief single-target flickering, lasting less than a minute, to extract generalizable spatial-temporal patterns. Leveraging the calibration data, we developed two complementary methods to construct cVEP temporal patterns: a linear modeling method based on the stimulus sequence and transfer learning techniques using cross-subject data. As a result, we achieved the highest ITR of 250 bpm under a minute of calibration, which has been shown to be comparable to state-of-the-art SSVEP paradigms. In summary, our work significantly improves cVEP performance under few-shot learning, which is expected to expand the practicality and usability of cVEP-BCIs.
Submitted 20 November, 2023; originally announced November 2023.
Comments: 35 pages, 5 figures
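A hedged sketch of the linear-modeling idea: treat the calibration EEG as the convolution of the binary stimulus code with an unknown impulse response, estimate that response by ridge regression from the brief single-target recording, then synthesize templates for unseen codes. Shapes, lag count, and the ridge penalty are illustrative, not the paper's exact pipeline.

```python
import numpy as np

def fit_impulse_response(stimulus, eeg, n_lags=30, lam=1.0):
    """stimulus: (T,) 0/1 code sequence; eeg: (T,) calibration signal.
    Build a lagged design matrix and solve the ridge-regularized
    least-squares problem for the evoked impulse response."""
    X = np.stack([np.roll(stimulus, k) for k in range(n_lags)], axis=1)
    X[:n_lags] = 0  # zero out samples that wrapped around
    return np.linalg.solve(X.T @ X + lam * np.eye(n_lags), X.T @ eeg)

def predict_template(stimulus, w):
    """Synthesize the expected response to any (unseen) code sequence."""
    return np.convolve(stimulus, w)[: len(stimulus)]
```

Because the response model is tied to the stimulus code rather than to a particular target, templates for every encoding target follow from one short calibration, which is the point of the minimal-calibration design.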
arXiv:2311.07608 [pdf, other] cs.LG, cs.AI
MuST: Multimodal Spatiotemporal Graph-Transformer for Hospital Readmission Prediction
Authors: Yan Miao, Lequan Yu
Abstract: Hospital readmission prediction is considered an essential approach to decreasing readmission rates, a key factor in assessing the quality and efficacy of a healthcare system. Previous studies have extensively utilized three primary modalities, namely electronic health records (EHR), medical images, and clinical notes, to predict hospital readmissions. However, the majority of these studies did not integrate information from all three modalities or utilize the spatiotemporal relationships present in the dataset. This study introduces a novel model called the Multimodal Spatiotemporal Graph-Transformer (MuST) for predicting hospital readmissions. By employing Graph Convolution Networks and temporal transformers, we can effectively capture spatial and temporal dependencies in EHR and chest radiographs. We then propose a fusion transformer to combine the spatiotemporal features from the two modalities above with the features from clinical notes extracted by a pre-trained, domain-specific transformer. We assess the effectiveness of our methods using the latest publicly available dataset, MIMIC-IV. The experimental results indicate that the inclusion of multimodal features in MuST improves its performance in comparison to unimodal methods. Furthermore, our proposed pipeline outperforms the current leading methods in the prediction of hospital readmissions.
Submitted 11 November, 2023; originally announced November 2023.
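The fusion step can be pictured as a small transformer over one token per modality. The sketch below assumes the upstream GCN and temporal-transformer encoders have already produced fixed-size EHR, radiograph, and note embeddings; the dimensions, pooling, and layer counts are hypothetical, not MuST's exact architecture.

```python
import torch, torch.nn as nn

class FusionTransformer(nn.Module):
    """Toy fusion head: three modality embeddings enter a shared transformer
    as a three-token sequence; the pooled output predicts readmission."""
    def __init__(self, dim=256):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(dim, 1)

    def forward(self, ehr, cxr, notes):               # each: (batch, dim)
        tokens = torch.stack([ehr, cxr, notes], dim=1)  # (batch, 3, dim)
        fused = self.encoder(tokens).mean(dim=1)        # pool over modalities
        return torch.sigmoid(self.head(fused)).squeeze(-1)
```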
To this end, we present an automatic Bayesian network construction method that extends a structure-learning-based functional dependency discovery method with similarity functions to capture the relationships between attributes. Furthermore, our system allows users to modify the generated Bayesian network in order to specify prior information or correct inaccuracies identified by the automatic generation process. We also design an effective scoring model (called the compensative scoring model) necessary for the Bayesian inference. To enhance the efficiency of data cleaning, we propose several approximation strategies for the Bayesian inference, including graph partitioning, domain pruning, and pre-detection. By evaluating on both real-world and synthetic datasets, we demonstrate that BClean is capable of achieving an F-measure of up to 0.9 in data cleaning, outperforming existing Bayesian methods by 2% and other data cleaning methods by 15%.

Submitted 11 November, 2023; originally announced November 2023.

Comments: Our source code is available at https://github.com/yyssl88/BClean
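To make the "cleaning as Bayesian inference" idea concrete, below is a toy Python sketch that repairs a suspect cell by scoring each candidate value with a Laplace-smoothed prior times naive per-attribute likelihoods estimated from the table itself. BClean's actual system constructs the Bayesian network automatically and scores with its compensative model; the function name, smoothing scheme, and data here are purely illustrative.

import math
from collections import Counter

def repair_cell(rows, target, evidence_attrs, row, alpha=1.0):
    # Score each candidate value v for `target` by
    # log P(v) + sum over evidence attrs a of log P(row[a] | v),
    # with Laplace smoothing alpha; return the maximum-posterior candidate.
    candidates = sorted({r[target] for r in rows})
    prior = Counter(r[target] for r in rows)
    n = len(rows)
    best, best_score = None, float("-inf")
    for v in candidates:
        score = math.log((prior[v] + alpha) / (n + alpha * len(candidates)))
        matching = [r for r in rows if r[target] == v]
        for a in evidence_attrs:
            co = sum(1 for r in matching if r[a] == row[a])
            domain = len({r[a] for r in rows})
            score += math.log((co + alpha) / (len(matching) + alpha * domain))
        if score > best_score:
            best, best_score = v, score
    return best

table = [
    {"city": "NYC", "state": "NY"},
    {"city": "NYC", "state": "NY"},
    {"city": "NYC", "state": "CA"},  # likely-dirty cell
    {"city": "LA",  "state": "CA"},
]
print(repair_cell(table, "state", ["city"], table[2]))  # -> NY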
