CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,095 results for author: <span class="mathjax">Han, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Han%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Han, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Han%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Han, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Han%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17217">arXiv:2411.17217</a> <span> [<a href="https://arxiv.org/pdf/2411.17217">pdf</a>, <a href="https://arxiv.org/format/2411.17217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Promptable Anomaly Segmentation with SAM Through Self-Perception Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hui-Yue Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zijia Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yongliang Tang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Pengcheng Gao</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+Y">Yuming Quan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jungong Han</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+G">Guiguang Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17217v2-abstract-short" style="display: inline;"> Segment Anything Model (SAM) has made great progress in anomaly segmentation tasks due to its impressive generalization ability. However, existing methods that directly apply SAM through prompting often overlook the domain shift issue, where SAM performs well on natural images but struggles in industrial scenarios. Parameter-Efficient Fine-Tuning (PEFT) offers a promising solution, but it may yiel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17217v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17217v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17217v2-abstract-full" style="display: none;"> Segment Anything Model (SAM) has made great progress in anomaly segmentation tasks due to its impressive generalization ability. However, existing methods that directly apply SAM through prompting often overlook the domain shift issue, where SAM performs well on natural images but struggles in industrial scenarios. Parameter-Efficient Fine-Tuning (PEFT) offers a promising solution, but it may yield suboptimal performance by not adequately addressing the perception challenges during adaptation to anomaly images. In this paper, we propose a novel Self-Perceptinon Tuning (SPT) method, aiming to enhance SAM's perception capability for anomaly segmentation. The SPT method incorporates a self-drafting tuning strategy, which generates an initial coarse draft of the anomaly mask, followed by a refinement process. Additionally, a visual-relation-aware adapter is introduced to improve the perception of discriminative relational information for mask generation. Extensive experimental results on several benchmark datasets demonstrate that our SPT method can significantly outperform baseline methods, validating its effectiveness. Models and codes will be available online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17217v2-abstract-full').style.display = 'none'; document.getElementById('2411.17217v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16666">arXiv:2411.16666</a> <span> [<a href="https://arxiv.org/pdf/2411.16666">pdf</a>, <a href="https://arxiv.org/format/2411.16666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistical Finance">q-fin.ST</span> </div> </div> <p class="title is-5 mathjax"> CatNet: Effective FDR Control in LSTM with Gaussian Mirrors and SHAP Feature Importance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaan Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanzhe Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16666v2-abstract-short" style="display: inline;"> We introduce CatNet, an algorithm that effectively controls False Discovery Rate (FDR) and selects significant features in LSTM with the Gaussian Mirror (GM) method. To evaluate the feature importance of LSTM in time series, we introduce a vector of the derivative of the SHapley Additive exPlanations (SHAP) to measure feature importance. We also propose a new kernel-based dependence measure to avo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16666v2-abstract-full').style.display = 'inline'; document.getElementById('2411.16666v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16666v2-abstract-full" style="display: none;"> We introduce CatNet, an algorithm that effectively controls False Discovery Rate (FDR) and selects significant features in LSTM with the Gaussian Mirror (GM) method. To evaluate the feature importance of LSTM in time series, we introduce a vector of the derivative of the SHapley Additive exPlanations (SHAP) to measure feature importance. We also propose a new kernel-based dependence measure to avoid multicollinearity in the GM algorithm, to make a robust feature selection with controlled FDR. We use simulated data to evaluate CatNet's performance in both linear models and LSTM models with different link functions. The algorithm effectively controls the FDR while maintaining a high statistical power in all cases. We also evaluate the algorithm's performance in different low-dimensional and high-dimensional cases, demonstrating its robustness in various input dimensions. To evaluate CatNet's performance in real world applications, we construct a multi-factor investment portfolio to forecast the prices of S\&P 500 index components. The results demonstrate that our model achieves superior predictive accuracy compared to traditional LSTM models without feature selection and FDR control. Additionally, CatNet effectively captures common market-driving features, which helps informed decision-making in financial markets by enhancing the interpretability of predictions. Our study integrates of the Gaussian Mirror algorithm with LSTM models for the first time, and introduces SHAP values as a new feature importance metric for FDR control methods, marking a significant advancement in feature selection and error control for neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16666v2-abstract-full').style.display = 'none'; document.getElementById('2411.16666v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14788">arXiv:2411.14788</a> <span> [<a href="https://arxiv.org/pdf/2411.14788">pdf</a>, <a href="https://arxiv.org/format/2411.14788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Jovis: A Visualization Tool for PostgreSQL Query Optimizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yoojin Choi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Juhee Han</a>, <a href="/search/cs?searchtype=author&query=Koo%2C+K">Kyoseung Koo</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+B">Bongki Moon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14788v1-abstract-short" style="display: inline;"> In the world of relational database management, the query optimizer is a critical component that significantly impacts query performance. To address the challenge of optimizing query performance due to the complexity of optimizers -- especially with join operations -- we introduce Jovis. This novel visualization tool provides a window into the often intricate process of query optimization in Postg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14788v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14788v1-abstract-full" style="display: none;"> In the world of relational database management, the query optimizer is a critical component that significantly impacts query performance. To address the challenge of optimizing query performance due to the complexity of optimizers -- especially with join operations -- we introduce Jovis. This novel visualization tool provides a window into the often intricate process of query optimization in PostgreSQL, making it more accessible and understandable. PostgreSQL employs two different query optimization strategies: the Dynamic Programming (DP) Optimizer for most scenarios and the Genetic Query Optimizer (GEQO) for more complex queries with numerous joins, both of which are supported in Jovis. Our tool visualizes the optimizer's decision-making process, from evaluating access paths for each relation to determining join orderings, all using data derived from the optimizer's logs. Jovis not only clarifies the query optimization process through visualizations but also serves as an invaluable learning tool for learners and a practical resource for experienced database professionals looking to optimize their query performance or even the query optimizer itself. The source code has been made available at https://github.com/snu-jovis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14788v1-abstract-full').style.display = 'none'; document.getElementById('2411.14788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12309">arXiv:2411.12309</a> <span> [<a href="https://arxiv.org/pdf/2411.12309">pdf</a>, <a href="https://arxiv.org/format/2411.12309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DGTR: Distributed Gaussian Turbo-Reconstruction for Sparse-View Vast Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuanyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Haosong Peng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chenming Wu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Weicai Ye</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yufeng Zhan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dingwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12309v2-abstract-short" style="display: inline;"> Novel-view synthesis (NVS) approaches play a critical role in vast scene reconstruction. However, these methods rely heavily on dense image inputs and prolonged training times, making them unsuitable where computational resources are limited. Additionally, few-shot methods often struggle with poor reconstruction quality in vast environments. This paper presents DGTR, a novel distributed framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12309v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12309v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12309v2-abstract-full" style="display: none;"> Novel-view synthesis (NVS) approaches play a critical role in vast scene reconstruction. However, these methods rely heavily on dense image inputs and prolonged training times, making them unsuitable where computational resources are limited. Additionally, few-shot methods often struggle with poor reconstruction quality in vast environments. This paper presents DGTR, a novel distributed framework for efficient Gaussian reconstruction for sparse-view vast scenes. Our approach divides the scene into regions, processed independently by drones with sparse image inputs. Using a feed-forward Gaussian model, we predict high-quality Gaussian primitives, followed by a global alignment algorithm to ensure geometric consistency. Synthetic views and depth priors are incorporated to further enhance training, while a distillation-based model aggregation mechanism enables efficient reconstruction. Our method achieves high-quality large-scale scene reconstruction and novel-view synthesis in significantly reduced training times, outperforming existing approaches in both speed and scalability. We demonstrate the effectiveness of our framework on vast aerial scenes, achieving high-quality results within minutes. Code will released on our [https://3d-aigc.github.io/DGTR]. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12309v2-abstract-full').style.display = 'none'; document.getElementById('2411.12309v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will released on our [https://3d-aigc.github.io/DGTR]</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12301">arXiv:2411.12301</a> <span> [<a href="https://arxiv.org/pdf/2411.12301">pdf</a>, <a href="https://arxiv.org/format/2411.12301">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Physics-Guided Detector for SAR Airplanes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Long Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuxin Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhirui Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Gong Cheng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12301v1-abstract-short" style="display: inline;"> The disperse structure distributions (discreteness) and variant scattering characteristics (variability) of SAR airplane targets lead to special challenges of object detection and recognition. The current deep learning-based detectors encounter challenges in distinguishing fine-grained SAR airplanes against complex backgrounds. To address it, we propose a novel physics-guided detector (PGD) learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12301v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12301v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12301v1-abstract-full" style="display: none;"> The disperse structure distributions (discreteness) and variant scattering characteristics (variability) of SAR airplane targets lead to special challenges of object detection and recognition. The current deep learning-based detectors encounter challenges in distinguishing fine-grained SAR airplanes against complex backgrounds. To address it, we propose a novel physics-guided detector (PGD) learning paradigm for SAR airplanes that comprehensively investigate their discreteness and variability to improve the detection performance. It is a general learning paradigm that can be extended to different existing deep learning-based detectors with "backbone-neck-head" architectures. The main contributions of PGD include the physics-guided self-supervised learning, feature enhancement, and instance perception, denoted as PGSSL, PGFE, and PGIP, respectively. PGSSL aims to construct a self-supervised learning task based on a wide range of SAR airplane targets that encodes the prior knowledge of various discrete structure distributions into the embedded space. Then, PGFE enhances the multi-scale feature representation of a detector, guided by the physics-aware information learned from PGSSL. PGIP is constructed at the detection head to learn the refined and dominant scattering point of each SAR airplane instance, thus alleviating the interference from the complex background. We propose two implementations, denoted as PGD and PGD-Lite, and apply them to various existing detectors with different backbones and detection heads. The experiments demonstrate the flexibility and effectiveness of the proposed PGD, which can improve existing detectors on SAR airplane detection with fine-grained classification task (an improvement of 3.1\% mAP most), and achieve the state-of-the-art performance (90.7\% mAP) on SAR-AIRcraft-1.0 dataset. The project is open-source at \url{https://github.com/XAI4SAR/PGD}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12301v1-abstract-full').style.display = 'none'; document.getElementById('2411.12301v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12220">arXiv:2411.12220</a> <span> [<a href="https://arxiv.org/pdf/2411.12220">pdf</a>, <a href="https://arxiv.org/format/2411.12220">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> DeTrigger: A Gradient-Centric Approach to Backdoor Attack Mitigation in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kichang Lee</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+Y">Yujin Shin</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+J">Jonghyuk Yun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jun Han</a>, <a href="/search/cs?searchtype=author&query=Ko%2C+J">JeongGil Ko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12220v1-abstract-short" style="display: inline;"> Federated Learning (FL) enables collaborative model training across distributed devices while preserving local data privacy, making it ideal for mobile and embedded systems. However, the decentralized nature of FL also opens vulnerabilities to model poisoning attacks, particularly backdoor attacks, where adversaries implant trigger patterns to manipulate model predictions. In this paper, we propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12220v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12220v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12220v1-abstract-full" style="display: none;"> Federated Learning (FL) enables collaborative model training across distributed devices while preserving local data privacy, making it ideal for mobile and embedded systems. However, the decentralized nature of FL also opens vulnerabilities to model poisoning attacks, particularly backdoor attacks, where adversaries implant trigger patterns to manipulate model predictions. In this paper, we propose DeTrigger, a scalable and efficient backdoor-robust federated learning framework that leverages insights from adversarial attack methodologies. By employing gradient analysis with temperature scaling, DeTrigger detects and isolates backdoor triggers, allowing for precise model weight pruning of backdoor activations without sacrificing benign model knowledge. Extensive evaluations across four widely used datasets demonstrate that DeTrigger achieves up to 251x faster detection than traditional methods and mitigates backdoor attacks by up to 98.9%, with minimal impact on global model accuracy. Our findings establish DeTrigger as a robust and scalable solution to protect federated learning environments against sophisticated backdoor threats. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12220v1-abstract-full').style.display = 'none'; document.getElementById('2411.12220v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11930">arXiv:2411.11930</a> <span> [<a href="https://arxiv.org/pdf/2411.11930">pdf</a>, <a href="https://arxiv.org/format/2411.11930">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AtomThink: A Slow Thinking Framework for Multimodal Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+K">Kun Xiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhili Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zihao Jiang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yunshuang Nie</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Runhui Huang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Haoxiang Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanhui Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weiran Huang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yihan Zeng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11930v2-abstract-short" style="display: inline;"> In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of ``slow thinking" into multimodal large language models (MLLMs). Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11930v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11930v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11930v2-abstract-full" style="display: none;"> In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of ``slow thinking" into multimodal large language models (MLLMs). Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoning. To this end, we design a novel AtomThink framework composed of three key modules: (i) a CoT annotation engine that automatically generates high-quality CoT annotations to address the lack of high-quality visual mathematical data; (ii) an atomic step fine-tuning strategy that jointly optimizes an MLLM and a policy reward model (PRM) for step-wise reasoning; and (iii) four different search strategies that can be applied with the PRM to complete reasoning. Additionally, we propose AtomMATH, a large-scale multimodal dataset of long CoTs, and an atomic capability evaluation metric for mathematical tasks. Extensive experimental results show that the proposed AtomThink significantly improves the performance of baseline MLLMs, achieving approximately 50\% relative accuracy gains on MathVista and 120\% on MathVerse. To support the advancement of multimodal slow-thinking models, we will make our code and dataset publicly available on https://github.com/Quinn777/AtomThink. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11930v2-abstract-full').style.display = 'none'; document.getElementById('2411.11930v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11407">arXiv:2411.11407</a> <span> [<a href="https://arxiv.org/pdf/2411.11407">pdf</a>, <a href="https://arxiv.org/format/2411.11407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Dark Side of Trust: Authority Citation-Driven Jailbreak Attacks on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xikang Yang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xuehai Tang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jizhong Han</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Songlin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11407v1-abstract-short" style="display: inline;"> The widespread deployment of large language models (LLMs) across various domains has showcased their immense potential while exposing significant safety vulnerabilities. A major concern is ensuring that LLM-generated content aligns with human values. Existing jailbreak techniques reveal how this alignment can be compromised through specific prompts or adversarial suffixes. In this study, we introd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11407v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11407v1-abstract-full" style="display: none;"> The widespread deployment of large language models (LLMs) across various domains has showcased their immense potential while exposing significant safety vulnerabilities. A major concern is ensuring that LLM-generated content aligns with human values. Existing jailbreak techniques reveal how this alignment can be compromised through specific prompts or adversarial suffixes. In this study, we introduce a new threat: LLMs' bias toward authority. While this inherent bias can improve the quality of outputs generated by LLMs, it also introduces a potential vulnerability, increasing the risk of producing harmful content. Notably, the biases in LLMs is the varying levels of trust given to different types of authoritative information in harmful queries. For example, malware development often favors trust GitHub. To better reveal the risks with LLM, we propose DarkCite, an adaptive authority citation matcher and generator designed for a black-box setting. DarkCite matches optimal citation types to specific risk types and generates authoritative citations relevant to harmful instructions, enabling more effective jailbreak attacks on aligned LLMs.Our experiments show that DarkCite achieves a higher attack success rate (e.g., LLama-2 at 76% versus 68%) than previous methods. To counter this risk, we propose an authenticity and harm verification defense strategy, raising the average defense pass rate (DPR) from 11% to 74%. More importantly, the ability to link citations to the content they encompass has become a foundational function in LLMs, amplifying the influence of LLMs' bias toward authority. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11407v1-abstract-full').style.display = 'none'; document.getElementById('2411.11407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10294">arXiv:2411.10294</a> <span> [<a href="https://arxiv.org/pdf/2411.10294">pdf</a>, <a href="https://arxiv.org/format/2411.10294">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> Static network structure cannot stabilize cooperation among Large Language Model agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jin Han</a>, <a href="/search/cs?searchtype=author&query=Battu%2C+B">Balaraju Battu</a>, <a href="/search/cs?searchtype=author&query=Romi%C4%87%2C+I">Ivan Romi膰</a>, <a href="/search/cs?searchtype=author&query=Rahwan%2C+T">Talal Rahwan</a>, <a href="/search/cs?searchtype=author&query=Holme%2C+P">Petter Holme</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10294v1-abstract-short" style="display: inline;"> Large language models (LLMs) are increasingly used to model human social behavior, with recent research exploring their ability to simulate social dynamics. Here, we test whether LLMs mirror human behavior in social dilemmas, where individual and collective interests conflict. Humans generally cooperate more than expected in laboratory settings, showing less cooperation in well-mixed populations b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10294v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10294v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10294v1-abstract-full" style="display: none;"> Large language models (LLMs) are increasingly used to model human social behavior, with recent research exploring their ability to simulate social dynamics. Here, we test whether LLMs mirror human behavior in social dilemmas, where individual and collective interests conflict. Humans generally cooperate more than expected in laboratory settings, showing less cooperation in well-mixed populations but more in fixed networks. In contrast, LLMs tend to exhibit greater cooperation in well-mixed settings. This raises a key question: Are LLMs about to emulate human behavior in cooperative dilemmas on networks? In this study, we examine networked interactions where agents repeatedly engage in the Prisoner's Dilemma within both well-mixed and structured network configurations, aiming to identify parallels in cooperative behavior between LLMs and humans. Our findings indicate critical distinctions: while humans tend to cooperate more within structured networks, LLMs display increased cooperation mainly in well-mixed environments, with limited adjustment to networked contexts. Notably, LLM cooperation also varies across model types, illustrating the complexities of replicating human-like social adaptability in artificial agents. These results highlight a crucial gap: LLMs struggle to emulate the nuanced, adaptive social strategies humans deploy in fixed networks. Unlike human participants, LLMs do not alter their cooperative behavior in response to network structures or evolving social contexts, missing the reciprocity norms that humans adaptively employ. This limitation points to a fundamental need in future LLM design -- to integrate a deeper comprehension of social norms, enabling more authentic modeling of human-like cooperation and adaptability in networked environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10294v1-abstract-full').style.display = 'none'; document.getElementById('2411.10294v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10005">arXiv:2411.10005</a> <span> [<a href="https://arxiv.org/pdf/2411.10005">pdf</a>, <a href="https://arxiv.org/format/2411.10005">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Analyzing Performance Characteristics of PostgreSQL and MariaDB on NVMeVirt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Juhee Han</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yoojin Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10005v1-abstract-short" style="display: inline;"> The NVMeVirt paper analyzes the implication of storage performance on database engine performance to promote the tunable performance of NVMeVirt. They perform analysis on two very popular database engines, MariaDB and PostgreSQL. The result shows that MariaDB is more efficient when the storage is slow, but PostgreSQL outperforms MariaDB as I/O bandwidth increases. Although this verifies that NVMeV… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10005v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10005v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10005v1-abstract-full" style="display: none;"> The NVMeVirt paper analyzes the implication of storage performance on database engine performance to promote the tunable performance of NVMeVirt. They perform analysis on two very popular database engines, MariaDB and PostgreSQL. The result shows that MariaDB is more efficient when the storage is slow, but PostgreSQL outperforms MariaDB as I/O bandwidth increases. Although this verifies that NVMeVirt can support advanced storage bandwidth configurations, the paper does not provide a clear explanation of why two database engines react very differently to the storage performance. To understand why the above two database engines have different performance characteristics, we conduct a study of the database engine's internals. We focus on three major differences in Multi-version concurrency control (MVCC) implementations: version storage, garbage collection, and index management. We also evaluated each scheme's I/O overhead using OLTP workload. Our analysis identifies the reason why MariaDB outperforms PostgreSQL when the bandwidth is low. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10005v1-abstract-full').style.display = 'none'; document.getElementById('2411.10005v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09997">arXiv:2411.09997</a> <span> [<a href="https://arxiv.org/pdf/2411.09997">pdf</a>, <a href="https://arxiv.org/format/2411.09997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> DBenVis: A Visual Analytics System for Comparing DBMS Performance via Benchmark Programs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yoojin Choi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Juhee Han</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Daehyun Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09997v1-abstract-short" style="display: inline;"> Database benchmarking is an essential method for evaluating and comparing the performance characteristics of a database management system (DBMS). It helps researchers and developers to evaluate the efficacy of their optimizations or newly developed DBMS solutions. Also, companies can benefit by analyzing the performance of DBMS under specific workloads and leveraging the result to select the most… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09997v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09997v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09997v1-abstract-full" style="display: none;"> Database benchmarking is an essential method for evaluating and comparing the performance characteristics of a database management system (DBMS). It helps researchers and developers to evaluate the efficacy of their optimizations or newly developed DBMS solutions. Also, companies can benefit by analyzing the performance of DBMS under specific workloads and leveraging the result to select the most suitable system for their needs. The proper interpretation of raw benchmark results requires effective visualization, which helps users gain meaningful insights. However, visualization of the results requires prior knowledge, and existing approaches often involve time-consuming manual tasks. This is due to the absence of a unified visual analytics system for benchmark results across diverse DBMSs. To address these challenges, we present DBenVis, an interactive visual analytics system that provides efficient and versatile benchmark results visualization. DBenVis is designed to support both online transaction processing (OLTP) and online analytic processing (OLAP) benchmarks. DBenVis provides an interactive comparison view, which enables users to perform an in-depth analysis of performance characteristics across various metrics among different DBMSs. Notably, we devise an interactive visual encoding idiom for the OLAP benchmark to represent a query execution plan as a tree. In the process of building a system, we propose novel techniques for parsing meaningful data from raw benchmark results and converting the query plan to a D3 hierarchical format. Through case studies conducted with domain experts, we demonstrate the efficacy and usability of DBenVis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09997v1-abstract-full').style.display = 'none'; document.getElementById('2411.09997v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09339">arXiv:2411.09339</a> <span> [<a href="https://arxiv.org/pdf/2411.09339">pdf</a>, <a href="https://arxiv.org/format/2411.09339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Re-Parameterization of Lightweight Transformer for On-Device Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixing Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhongren Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weixiang Xu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jing Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09339v1-abstract-short" style="display: inline;"> With the increasing implementation of machine learning models on edge or Internet-of-Things (IoT) devices, deploying advanced models on resource-constrained IoT devices remains challenging. Transformer models, a currently dominant neural architecture, have achieved great success in broad domains but their complexity hinders its deployment on IoT devices with limited computation capability and stor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09339v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09339v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09339v1-abstract-full" style="display: none;"> With the increasing implementation of machine learning models on edge or Internet-of-Things (IoT) devices, deploying advanced models on resource-constrained IoT devices remains challenging. Transformer models, a currently dominant neural architecture, have achieved great success in broad domains but their complexity hinders its deployment on IoT devices with limited computation capability and storage size. Although many model compression approaches have been explored, they often suffer from notorious performance degradation. To address this issue, we introduce a new method, namely Transformer Re-parameterization, to boost the performance of lightweight Transformer models. It consists of two processes: the High-Rank Factorization (HRF) process in the training stage and the deHigh-Rank Factorization (deHRF) process in the inference stage. In the former process, we insert an additional linear layer before the Feed-Forward Network (FFN) of the lightweight Transformer. It is supposed that the inserted HRF layers can enhance the model learning capability. In the later process, the auxiliary HRF layer will be merged together with the following FFN layer into one linear layer and thus recover the original structure of the lightweight model. To examine the effectiveness of the proposed method, we evaluate it on three widely used Transformer variants, i.e., ConvTransformer, Conformer, and SpeechFormer networks, in the application of speech emotion recognition on the IEMOCAP, M3ED and DAIC-WOZ datasets. Experimental results show that our proposed method consistently improves the performance of lightweight Transformers, even making them comparable to large models. The proposed re-parameterization approach enables advanced Transformer models to be deployed on resource-constrained IoT devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09339v1-abstract-full').style.display = 'none'; document.getElementById('2411.09339v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09153">arXiv:2411.09153</a> <span> [<a href="https://arxiv.org/pdf/2411.09153">pdf</a>, <a href="https://arxiv.org/format/2411.09153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for Effective Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Youpeng Wen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junfan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shen Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09153v1-abstract-short" style="display: inline;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09153v1-abstract-full" style="display: none;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting data without considering the relationship between visual observations and actions could lead to suboptimal data utilization. To this end, we propose VidMan (Video Diffusion for Robot Manipulation), a novel framework that employs a two-stage training mechanism inspired by dual-process theory from neuroscience to enhance stability and improve data utilization efficiency. Specifically, in the first stage, VidMan is pre-trained on the Open X-Embodiment dataset (OXE) for predicting future visual trajectories in a video denoising diffusion manner, enabling the model to develop a long horizontal awareness of the environment's dynamics. In the second stage, a flexible yet effective layer-wise self-attention adapter is introduced to transform VidMan into an efficient inverse dynamics model that predicts action modulated by the implicit dynamics knowledge via parameter sharing. Our VidMan framework outperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark, achieving a 11.7% relative improvement, and demonstrates over 9% precision gains on the OXE small-scale dataset. These results provide compelling evidence that world models can significantly enhance the precision of robot action prediction. Codes and models will be public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'none'; document.getElementById('2411.09153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08286">arXiv:2411.08286</a> <span> [<a href="https://arxiv.org/pdf/2411.08286">pdf</a>, <a href="https://arxiv.org/format/2411.08286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Hashing for Protein Structure Similarity Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jin Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wu-Jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08286v1-abstract-short" style="display: inline;"> Protein structure similarity search (PSSS), which tries to search proteins with similar structures, plays a crucial role across diverse domains from drug design to protein function prediction and molecular evolution. Traditional alignment-based PSSS methods, which directly calculate alignment on the protein structures, are highly time-consuming with high memory cost. Recently, alignment-free metho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08286v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08286v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08286v1-abstract-full" style="display: none;"> Protein structure similarity search (PSSS), which tries to search proteins with similar structures, plays a crucial role across diverse domains from drug design to protein function prediction and molecular evolution. Traditional alignment-based PSSS methods, which directly calculate alignment on the protein structures, are highly time-consuming with high memory cost. Recently, alignment-free methods, which represent protein structures as fixed-length real-valued vectors, are proposed for PSSS. Although these methods have lower time and memory cost than alignment-based methods, their time and memory cost is still too high for large-scale PSSS, and their accuracy is unsatisfactory. In this paper, we propose a novel method, called $\underline{\text{p}}$r$\underline{\text{o}}$tein $\underline{\text{s}}$tructure $\underline{\text{h}}$ashing (POSH), for PSSS. POSH learns a binary vector representation for each protein structure, which can dramatically reduce the time and memory cost for PSSS compared with real-valued vector representation based methods. Furthermore, in POSH we also propose expressive hand-crafted features and a structure encoder to well model both node and edge interactions in proteins. Experimental results on real datasets show that POSH can outperform other methods to achieve state-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more than six times and speed improvement of more than four times, compared with other methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08286v1-abstract-full').style.display = 'none'; document.getElementById('2411.08286v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06764">arXiv:2411.06764</a> <span> [<a href="https://arxiv.org/pdf/2411.06764">pdf</a>, <a href="https://arxiv.org/format/2411.06764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Stage Knowledge Integration of Vision-Language Models for Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongsheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Z">Zhong Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingren Liu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yanwei Pang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jungong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06764v1-abstract-short" style="display: inline;"> Vision Language Models (VLMs), pre-trained on large-scale image-text datasets, enable zero-shot predictions for unseen data but may underperform on specific unseen tasks. Continual learning (CL) can help VLMs effectively adapt to new data distributions without joint training, but faces challenges of catastrophic forgetting and generalization forgetting. Although significant progress has been achie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06764v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06764v1-abstract-full" style="display: none;"> Vision Language Models (VLMs), pre-trained on large-scale image-text datasets, enable zero-shot predictions for unseen data but may underperform on specific unseen tasks. Continual learning (CL) can help VLMs effectively adapt to new data distributions without joint training, but faces challenges of catastrophic forgetting and generalization forgetting. Although significant progress has been achieved by distillation-based methods, they exhibit two severe limitations. One is the popularly adopted single-teacher paradigm fails to impart comprehensive knowledge, The other is the existing methods inadequately leverage the multimodal information in the original training dataset, instead they rely on additional data for distillation, which increases computational and storage overhead. To mitigate both limitations, by drawing on Knowledge Integration Theory (KIT), we propose a Multi-Stage Knowledge Integration network (MulKI) to emulate the human learning process in distillation methods. MulKI achieves this through four stages, including Eliciting Ideas, Adding New Ideas, Distinguishing Ideas, and Making Connections. During the four stages, we first leverage prototypes to align across modalities, eliciting cross-modal knowledge, then adding new knowledge by constructing fine-grained intra- and inter-modality relationships with prototypes. After that, knowledge from two teacher models is adaptively distinguished and re-weighted. Finally, we connect between models from intra- and inter-task, integrating preceding and new knowledge. Our method demonstrates significant improvements in maintaining zero-shot capabilities while supporting continual learning across diverse downstream tasks, showcasing its potential in adapting VLMs to evolving data distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06764v1-abstract-full').style.display = 'none'; document.getElementById('2411.06764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span> [<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/cs?searchtype=author&query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05027">arXiv:2411.05027</a> <span> [<a href="https://arxiv.org/pdf/2411.05027">pdf</a>, <a href="https://arxiv.org/format/2411.05027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MGRS.2024.3483459">10.1109/MGRS.2024.3483459 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Generative Artificial Intelligence Meets Synthetic Aperture Radar: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xidan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zuqian Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Feng Xu</a>, <a href="/search/cs?searchtype=author&query=Datcu%2C+M">Mihai Datcu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05027v1-abstract-short" style="display: inline;"> SAR images possess unique attributes that present challenges for both human observers and vision AI models to interpret, owing to their electromagnetic characteristics. The interpretation of SAR images encounters various hurdles, with one of the primary obstacles being the data itself, which includes issues related to both the quantity and quality of the data. The challenges can be addressed using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05027v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05027v1-abstract-full" style="display: none;"> SAR images possess unique attributes that present challenges for both human observers and vision AI models to interpret, owing to their electromagnetic characteristics. The interpretation of SAR images encounters various hurdles, with one of the primary obstacles being the data itself, which includes issues related to both the quantity and quality of the data. The challenges can be addressed using generative AI technologies. Generative AI, often known as GenAI, is a very advanced and powerful technology in the field of artificial intelligence that has gained significant attention. The advancement has created possibilities for the creation of texts, photorealistic pictures, videos, and material in various modalities. This paper aims to comprehensively investigate the intersection of GenAI and SAR. First, we illustrate the common data generation-based applications in SAR field and compare them with computer vision tasks, analyzing the similarity, difference, and general challenges of them. Then, an overview of the latest GenAI models is systematically reviewed, including various basic models and their variations targeting the general challenges. Additionally, the corresponding applications in SAR domain are also included. Specifically, we propose to summarize the physical model based simulation approaches for SAR, and analyze the hybrid modeling methods that combine the GenAI and interpretable models. The evaluation methods that have been or could be applied to SAR, are also explored. Finally, the potential challenges and future prospects are discussed. To our best knowledge, this survey is the first exhaustive examination of the interdiscipline of SAR and GenAI, encompassing a wide range of topics, including deep neural networks, physical models, computer vision, and SAR images. The resources of this survey are open-source at \url{https://github.com/XAI4SAR/GenAIxSAR}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05027v1-abstract-full').style.display = 'none'; document.getElementById('2411.05027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02028">arXiv:2411.02028</a> <span> [<a href="https://arxiv.org/pdf/2411.02028">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> An Immediate Update Strategy of Multi-State Constraint Kalman Filter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingchao Zhang</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wei Ouyang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiale Han</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Q">Qi Cai</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Maoran Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuanxin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02028v1-abstract-short" style="display: inline;"> The lightweight Multi-state Constraint Kalman Filter (MSCKF) has been well-known for its high efficiency, in which the delayed update has been usually adopted since its proposal. This work investigates the immediate update strategy of MSCKF based on timely reconstructed 3D feature points and measurement constraints. The differences between the delayed update and the immediate update are theoretica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02028v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02028v1-abstract-full" style="display: none;"> The lightweight Multi-state Constraint Kalman Filter (MSCKF) has been well-known for its high efficiency, in which the delayed update has been usually adopted since its proposal. This work investigates the immediate update strategy of MSCKF based on timely reconstructed 3D feature points and measurement constraints. The differences between the delayed update and the immediate update are theoretically analyzed in detail. It is found that the immediate update helps construct more observation constraints and employ more filtering updates than the delayed update, which improves the linearization point of the measurement model and therefore enhances the estimation accuracy. Numerical simulations and experiments show that the immediate update strategy significantly enhances MSCKF even with a small amount of feature observations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02028v1-abstract-full').style.display = 'none'; document.getElementById('2411.02028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01236">arXiv:2411.01236</a> <span> [<a href="https://arxiv.org/pdf/2411.01236">pdf</a>, <a href="https://arxiv.org/format/2411.01236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AutoPT: How Far Are We from the End2End Automated Web Penetration Testing? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+B">Benlong Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guoqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejiang Chen</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+X">Xiuwei Shang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiapeng Han</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yanru He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+N">Nenghai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01236v1-abstract-short" style="display: inline;"> Penetration testing is essential to ensure Web security, which can detect and fix vulnerabilities in advance, and prevent data leakage and serious consequences. The powerful inference capabilities of large language models (LLMs) have made significant progress in various fields, and the development potential of LLM-based agents can revolutionize the cybersecurity penetration testing industry. In th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01236v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01236v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01236v1-abstract-full" style="display: none;"> Penetration testing is essential to ensure Web security, which can detect and fix vulnerabilities in advance, and prevent data leakage and serious consequences. The powerful inference capabilities of large language models (LLMs) have made significant progress in various fields, and the development potential of LLM-based agents can revolutionize the cybersecurity penetration testing industry. In this work, we establish a comprehensive end-to-end penetration testing benchmark using a real-world penetration testing environment to explore the capabilities of LLM-based agents in this domain. Our results reveal that the agents are familiar with the framework of penetration testing tasks, but they still face limitations in generating accurate commands and executing complete processes. Accordingly, we summarize the current challenges, including the difficulty of maintaining the entire message history and the tendency for the agent to become stuck. Based on the above insights, we propose a Penetration testing State Machine (PSM) that utilizes the Finite State Machine (FSM) methodology to address these limitations. Then, we introduce AutoPT, an automated penetration testing agent based on the principle of PSM driven by LLMs, which utilizes the inherent inference ability of LLM and the constraint framework of state machines. Our evaluation results show that AutoPT outperforms the baseline framework ReAct on the GPT-4o mini model and improves the task completion rate from 22% to 41% on the benchmark target. Compared with the baseline framework and manual work, AutoPT also reduces time and economic costs further. Hence, our AutoPT has facilitated the development of automated penetration testing and significantly impacted both academia and industry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01236v1-abstract-full').style.display = 'none'; document.getElementById('2411.01236v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00278">arXiv:2411.00278</a> <span> [<a href="https://arxiv.org/pdf/2411.00278">pdf</a>, <a href="https://arxiv.org/format/2411.00278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> KAN-AD: Time Series Anomaly Detection with Kolmogorov-Arnold Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Quan Zhou</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+C">Changhua Pei</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Fei Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jing Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhengwei Gao</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+D">Dan Pei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+G">Gaogang Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianhui Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00278v1-abstract-short" style="display: inline;"> Time series anomaly detection (TSAD) has become an essential component of large-scale cloud services and web systems because it can promptly identify anomalies, providing early warnings to prevent greater losses. Deep learning-based forecasting methods have become very popular in TSAD due to their powerful learning capabilities. However, accurate predictions don't necessarily lead to better anomal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00278v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00278v1-abstract-full" style="display: none;"> Time series anomaly detection (TSAD) has become an essential component of large-scale cloud services and web systems because it can promptly identify anomalies, providing early warnings to prevent greater losses. Deep learning-based forecasting methods have become very popular in TSAD due to their powerful learning capabilities. However, accurate predictions don't necessarily lead to better anomaly detection. Due to the common occurrence of noise, i.e., local peaks and drops in time series, existing black-box learning methods can easily learn these unintended patterns, significantly affecting anomaly detection performance. Kolmogorov-Arnold Networks (KAN) offers a potential solution by decomposing complex temporal sequences into a combination of multiple univariate functions, making the training process more controllable. However, KAN optimizes univariate functions using spline functions, which are also susceptible to the influence of local anomalies. To address this issue, we present KAN-AD, which leverages the Fourier series to emphasize global temporal patterns, thereby mitigating the influence of local peaks and drops. KAN-AD improves both effectiveness and efficiency by transforming the existing black-box learning approach into learning the weights preceding univariate functions. Experimental results show that, compared to the current state-of-the-art, we achieved an accuracy increase of 15% while boosting inference speed by 55 times. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00278v1-abstract-full').style.display = 'none'; document.getElementById('2411.00278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23812">arXiv:2410.23812</a> <span> [<a href="https://arxiv.org/pdf/2410.23812">pdf</a>, <a href="https://arxiv.org/format/2410.23812">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Graph Neural Networks Uncover Geometric Neural Representations in Reinforcement-Based Motor Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nardi%2C+F">Federico Nardi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinpei Han</a>, <a href="/search/cs?searchtype=author&query=Haar%2C+S">Shlomi Haar</a>, <a href="/search/cs?searchtype=author&query=Faisal%2C+A+A">A. Aldo Faisal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23812v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNN) can capture the geometric properties of neural representations in EEG data. Here we utilise those to study how reinforcement-based motor learning affects neural activity patterns during motor planning, leveraging the inherent graph structure of EEG channels to capture the spatial relationships in brain activity. By exploiting task-specific symmetries, we define differen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23812v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23812v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23812v1-abstract-full" style="display: none;"> Graph Neural Networks (GNN) can capture the geometric properties of neural representations in EEG data. Here we utilise those to study how reinforcement-based motor learning affects neural activity patterns during motor planning, leveraging the inherent graph structure of EEG channels to capture the spatial relationships in brain activity. By exploiting task-specific symmetries, we define different pretraining strategies that not only improve model performance across all participant groups but also validate the robustness of the geometric representations. Explainability analysis based on the graph structures reveals consistent group-specific neural signatures that persist across pretraining conditions, suggesting stable geometric structures in the neural representations associated with motor learning and feedback processing. These geometric patterns exhibit partial invariance to certain task space transformations, indicating symmetries that enable generalisation across conditions while maintaining specificity to individual learning strategies. This work demonstrates how GNNs can uncover the effects of previous outcomes on motor planning, in a complex real-world task, providing insights into the geometric principles governing neural representations. Our experimental design bridges the gap between controlled experiments and ecologically valid scenarios, offering new insights into the organisation of neural representations during naturalistic motor learning, which may open avenues for exploring fundamental principles governing brain activity in complex tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23812v1-abstract-full').style.display = 'none'; document.getElementById('2410.23812v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 7 figures, accepted at the NeurIPS 2024 workshop on Symmetry and Geometry in Neural Representations (NeurReps 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22128">arXiv:2410.22128</a> <span> [<a href="https://arxiv.org/pdf/2410.22128">pdf</a>, <a href="https://arxiv.org/format/2410.22128">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PF3plat: Pose-Free Feed-Forward 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+S">Sunghwan Hong</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+J">Jaewoo Jung</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+H">Heeseong Shin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jisang Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaolong Yang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungryong Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22128v1-abstract-short" style="display: inline;"> We consider the problem of novel view synthesis from unposed images in a single feed-forward. Our framework capitalizes on fast speed, scalability, and high-quality 3D reconstruction and view synthesis capabilities of 3DGS, where we further extend it to offer a practical solution that relaxes common assumptions such as dense image views, accurate camera poses, and substantial image overlaps. We ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22128v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22128v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22128v1-abstract-full" style="display: none;"> We consider the problem of novel view synthesis from unposed images in a single feed-forward. Our framework capitalizes on fast speed, scalability, and high-quality 3D reconstruction and view synthesis capabilities of 3DGS, where we further extend it to offer a practical solution that relaxes common assumptions such as dense image views, accurate camera poses, and substantial image overlaps. We achieve this through identifying and addressing unique challenges arising from the use of pixel-aligned 3DGS: misaligned 3D Gaussians across different views induce noisy or sparse gradients that destabilize training and hinder convergence, especially when above assumptions are not met. To mitigate this, we employ pre-trained monocular depth estimation and visual correspondence models to achieve coarse alignments of 3D Gaussians. We then introduce lightweight, learnable modules to refine depth and pose estimates from the coarse alignments, improving the quality of 3D reconstruction and novel view synthesis. Furthermore, the refined estimates are leveraged to estimate geometry confidence scores, which assess the reliability of 3D Gaussian centers and condition the prediction of Gaussian parameters accordingly. Extensive evaluations on large-scale real-world datasets demonstrate that PF3plat sets a new state-of-the-art across all benchmarks, supported by comprehensive ablation studies validating our design choices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22128v1-abstract-full').style.display = 'none'; document.getElementById('2410.22128v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://cvlab-kaist.github.io/PF3plat/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21807">arXiv:2410.21807</a> <span> [<a href="https://arxiv.org/pdf/2410.21807">pdf</a>, <a href="https://arxiv.org/format/2410.21807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Fresh Look at Generalized Category Discovery through Non-negative Matrix Factorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+Z">Zhong Ji</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingren Liu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yanwei Pang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jungong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21807v2-abstract-short" style="display: inline;"> Generalized Category Discovery (GCD) aims to classify both base and novel images using labeled base data. However, current approaches inadequately address the intrinsic optimization of the co-occurrence matrix $\bar{A}$ based on cosine similarity, failing to achieve zero base-novel regions and adequate sparsity in base and novel domains. To address these deficiencies, we propose a Non-Negative Gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21807v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21807v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21807v2-abstract-full" style="display: none;"> Generalized Category Discovery (GCD) aims to classify both base and novel images using labeled base data. However, current approaches inadequately address the intrinsic optimization of the co-occurrence matrix $\bar{A}$ based on cosine similarity, failing to achieve zero base-novel regions and adequate sparsity in base and novel domains. To address these deficiencies, we propose a Non-Negative Generalized Category Discovery (NN-GCD) framework. It employs Symmetric Non-negative Matrix Factorization (SNMF) as a mathematical medium to prove the equivalence of optimal K-means with optimal SNMF, and the equivalence of SNMF solver with non-negative contrastive learning (NCL) optimization. Utilizing these theoretical equivalences, it reframes the optimization of $\bar{A}$ and K-means clustering as an NCL optimization problem. Moreover, to satisfy the non-negative constraints and make a GCD model converge to a near-optimal region, we propose a GELU activation function and an NMF NCE loss. To transition $\bar{A}$ from a suboptimal state to the desired $\bar{A}^*$, we introduce a hybrid sparse regularization approach to impose sparsity constraints. Experimental results show NN-GCD outperforms state-of-the-art methods on GCD benchmarks, achieving an average accuracy of 66.1\% on the Semantic Shift Benchmark, surpassing prior counterparts by 4.7\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21807v2-abstract-full').style.display = 'none'; document.getElementById('2410.21807v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21662">arXiv:2410.21662</a> <span> [<a href="https://arxiv.org/pdf/2410.21662">pdf</a>, <a href="https://arxiv.org/format/2410.21662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> $f$-PO: Generalizing Preference Optimization with $f$-divergence Minimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaqi Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Mingjian Jiang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuxuan Song</a>, <a href="/search/cs?searchtype=author&query=Leskovec%2C+J">Jure Leskovec</a>, <a href="/search/cs?searchtype=author&query=Ermon%2C+S">Stefano Ermon</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minkai Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21662v1-abstract-short" style="display: inline;"> Preference optimization has made significant progress recently, with numerous methods developed to align language models with human preferences. This paper introduces $f$-divergence Preference Optimization ($f$-PO), a novel framework that generalizes and extends existing approaches. $f$-PO minimizes $f$-divergences between the optimized policy and the optimal policy, encompassing a broad family of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21662v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21662v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21662v1-abstract-full" style="display: none;"> Preference optimization has made significant progress recently, with numerous methods developed to align language models with human preferences. This paper introduces $f$-divergence Preference Optimization ($f$-PO), a novel framework that generalizes and extends existing approaches. $f$-PO minimizes $f$-divergences between the optimized policy and the optimal policy, encompassing a broad family of alignment methods using various divergences. Our approach unifies previous algorithms like DPO and EXO, while offering new variants through different choices of $f$-divergences. We provide theoretical analysis of $f$-PO's properties and conduct extensive experiments on state-of-the-art language models using benchmark datasets. Results demonstrate $f$-PO's effectiveness across various tasks, achieving superior performance compared to existing methods on popular benchmarks such as AlpacaEval 2, Arena-Hard, and MT-Bench. Additionally, we present ablation studies exploring the impact of different $f$-divergences, offering insights into the trade-offs between regularization and performance in offline preference optimization. Our work contributes both practical algorithms and theoretical understanding to the field of language model alignment. Code is available at https://github.com/MinkaiXu/fPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21662v1-abstract-full').style.display = 'none'; document.getElementById('2410.21662v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20357">arXiv:2410.20357</a> <span> [<a href="https://arxiv.org/pdf/2410.20357">pdf</a>, <a href="https://arxiv.org/format/2410.20357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dynamics as Prompts: In-Context Learning for Sim-to-Real System Identifications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xilun Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiqi Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Peide Huang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W+J">William Jongwon Han</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Y">Yiqi Lyu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengdi Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Ding Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20357v1-abstract-short" style="display: inline;"> Sim-to-real transfer remains a significant challenge in robotics due to the discrepancies between simulated and real-world dynamics. Traditional methods like Domain Randomization often fail to capture fine-grained dynamics, limiting their effectiveness for precise control tasks. In this work, we propose a novel approach that dynamically adjusts simulation environment parameters online using in-con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20357v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20357v1-abstract-full" style="display: none;"> Sim-to-real transfer remains a significant challenge in robotics due to the discrepancies between simulated and real-world dynamics. Traditional methods like Domain Randomization often fail to capture fine-grained dynamics, limiting their effectiveness for precise control tasks. In this work, we propose a novel approach that dynamically adjusts simulation environment parameters online using in-context learning. By leveraging past interaction histories as context, our method adapts the simulation environment dynamics to real-world dynamics without requiring gradient updates, resulting in faster and more accurate alignment between simulated and real-world performance. We validate our approach across two tasks: object scooping and table air hockey. In the sim-to-sim evaluations, our method significantly outperforms the baselines on environment parameter estimation by 80% and 42% in the object scooping and table air hockey setups, respectively. Furthermore, our method achieves at least 70% success rate in sim-to-real transfer on object scooping across three different objects. By incorporating historical interaction data, our approach delivers efficient and smooth system identification, advancing the deployment of robots in dynamic real-world scenarios. Demos are available on our project page: https://sim2real-capture.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20357v1-abstract-full').style.display = 'none'; document.getElementById('2410.20357v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">website: https://sim2real-capture.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19542">arXiv:2410.19542</a> <span> [<a href="https://arxiv.org/pdf/2410.19542">pdf</a>, <a href="https://arxiv.org/format/2410.19542">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Brain-like Functional Organization within Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lin Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiaohui Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yutao Hu</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+M">Mengfei Zuo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xintao Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19542v2-abstract-short" style="display: inline;"> The human brain has long inspired the pursuit of artificial intelligence (AI). Recently, neuroimaging studies provide compelling evidence of alignment between the computational representation of artificial neural networks (ANNs) and the neural responses of the human brain to stimuli, suggesting that ANNs may employ brain-like information processing strategies. While such alignment has been observe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19542v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19542v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19542v2-abstract-full" style="display: none;"> The human brain has long inspired the pursuit of artificial intelligence (AI). Recently, neuroimaging studies provide compelling evidence of alignment between the computational representation of artificial neural networks (ANNs) and the neural responses of the human brain to stimuli, suggesting that ANNs may employ brain-like information processing strategies. While such alignment has been observed across sensory modalities--visual, auditory, and linguistic--much of the focus has been on the behaviors of artificial neurons (ANs) at the population level, leaving the functional organization of individual ANs that facilitates such brain-like processes largely unexplored. In this study, we bridge this gap by directly coupling sub-groups of artificial neurons with functional brain networks (FBNs), the foundational organizational structure of the human brain. Specifically, we extract representative patterns from temporal responses of ANs in large language models (LLMs), and use them as fixed regressors to construct voxel-wise encoding models to predict brain activity recorded by functional magnetic resonance imaging (fMRI). This framework links the AN sub-groups to FBNs, enabling the delineation of brain-like functional organization within LLMs. Our findings reveal that LLMs (BERT and Llama 1-3) exhibit brain-like functional architecture, with sub-groups of artificial neurons mirroring the organizational patterns of well-established FBNs. Notably, the brain-like functional organization of LLMs evolves with the increased sophistication and capability, achieving an improved balance between the diversity of computational behaviors and the consistency of functional specializations. This research represents the first exploration of brain-like functional organization within LLMs, offering novel insights to inform the development of artificial general intelligence (AGI) with human brain principles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19542v2-abstract-full').style.display = 'none'; document.getElementById('2410.19542v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19218">arXiv:2410.19218</a> <span> [<a href="https://arxiv.org/pdf/2410.19218">pdf</a>, <a href="https://arxiv.org/format/2410.19218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Taxonomy-guided Semantic Indexing for Academic Paper Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+S">SeongKu Kang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Pengcheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dongha Lee</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hwanjo Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19218v1-abstract-short" style="display: inline;"> Academic paper search is an essential task for efficient literature discovery and scientific advancement. While dense retrieval has advanced various ad-hoc searches, it often struggles to match the underlying academic concepts between queries and documents, which is critical for paper search. To enable effective academic concept matching for paper search, we propose Taxonomy-guided Semantic Indexi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19218v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19218v1-abstract-full" style="display: none;"> Academic paper search is an essential task for efficient literature discovery and scientific advancement. While dense retrieval has advanced various ad-hoc searches, it often struggles to match the underlying academic concepts between queries and documents, which is critical for paper search. To enable effective academic concept matching for paper search, we propose Taxonomy-guided Semantic Indexing (TaxoIndex) framework. TaxoIndex extracts key concepts from papers and organizes them as a semantic index guided by an academic taxonomy, and then leverages this index as foundational knowledge to identify academic concepts and link queries and documents. As a plug-and-play framework, TaxoIndex can be flexibly employed to enhance existing dense retrievers. Extensive experiments show that TaxoIndex brings significant improvements, even with highly limited training data, and greatly enhances interpretability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19218v1-abstract-full').style.display = 'none'; document.getElementById('2410.19218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP'24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18935">arXiv:2410.18935</a> <span> [<a href="https://arxiv.org/pdf/2410.18935">pdf</a>, <a href="https://arxiv.org/format/2410.18935">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Schema-Guided Culture-Aware Complex Event Simulation with Multi-Agent Role-Play </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Sha Li</a>, <a href="/search/cs?searchtype=author&query=Reddy%2C+R+G">Revanth Gangi Reddy</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+K+D">Khanh Duy Nguyen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qingyun Wang</a>, <a href="/search/cs?searchtype=author&query=Fung%2C+M">May Fung</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C">Chi Han</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Natarajan%2C+K">Kartik Natarajan</a>, <a href="/search/cs?searchtype=author&query=Voss%2C+C+R">Clare R. Voss</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+H">Heng Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18935v1-abstract-short" style="display: inline;"> Complex news events, such as natural disasters and socio-political conflicts, require swift responses from the government and society. Relying on historical events to project the future is insufficient as such events are sparse and do not cover all possible conditions and nuanced situations. Simulation of these complex events can help better prepare and reduce the negative impact. We develop a con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18935v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18935v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18935v1-abstract-full" style="display: none;"> Complex news events, such as natural disasters and socio-political conflicts, require swift responses from the government and society. Relying on historical events to project the future is insufficient as such events are sparse and do not cover all possible conditions and nuanced situations. Simulation of these complex events can help better prepare and reduce the negative impact. We develop a controllable complex news event simulator guided by both the event schema representing domain knowledge about the scenario and user-provided assumptions representing case-specific conditions. As event dynamics depend on the fine-grained social and cultural context, we further introduce a geo-diverse commonsense and cultural norm-aware knowledge enhancement component. To enhance the coherence of the simulation, apart from the global timeline of events, we take an agent-based approach to simulate the individual character states, plans, and actions. By incorporating the schema and cultural norms, our generated simulations achieve much higher coherence and appropriateness and are received favorably by participants from a humanitarian assistance organization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18935v1-abstract-full').style.display = 'none'; document.getElementById('2410.18935v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as EMNLP 2024 Demo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18111">arXiv:2410.18111</a> <span> [<a href="https://arxiv.org/pdf/2410.18111">pdf</a>, <a href="https://arxiv.org/format/2410.18111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data Efficiency for Large Recommendation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+K">Kshitij Jain</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jingru Xie</a>, <a href="/search/cs?searchtype=author&query=Regan%2C+K">Kevin Regan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Cheng Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jie Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Steve Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Phillips%2C+T">Todd Phillips</a>, <a href="/search/cs?searchtype=author&query=Sussman%2C+M">Myles Sussman</a>, <a href="/search/cs?searchtype=author&query=Troup%2C+M">Matt Troup</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+A">Angel Yu</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+J">Jia Zhuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18111v2-abstract-short" style="display: inline;"> Large recommendation models (LRMs) are fundamental to the multi-billion dollar online advertising industry, processing massive datasets of hundreds of billions of examples before transitioning to continuous online training to adapt to rapidly changing user behavior. The massive scale of data directly impacts both computational costs and the speed at which new methods can be evaluated (R&D velocity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18111v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18111v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18111v2-abstract-full" style="display: none;"> Large recommendation models (LRMs) are fundamental to the multi-billion dollar online advertising industry, processing massive datasets of hundreds of billions of examples before transitioning to continuous online training to adapt to rapidly changing user behavior. The massive scale of data directly impacts both computational costs and the speed at which new methods can be evaluated (R&D velocity). This paper presents actionable principles and high-level frameworks to guide practitioners in optimizing training data requirements. These strategies have been successfully deployed in Google's largest Ads CTR prediction models and are broadly applicable beyond LRMs. We outline the concept of data convergence, describe methods to accelerate this convergence, and finally, detail how to optimally balance training data volume with model size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18111v2-abstract-full').style.display = 'none'; document.getElementById('2410.18111v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17891">arXiv:2410.17891</a> <span> [<a href="https://arxiv.org/pdf/2410.17891">pdf</a>, <a href="https://arxiv.org/format/2410.17891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Diffusion Language Models via Adaptation from Autoregressive Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+S">Shansan Gong</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Shivam Agarwal</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yizhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jiacheng Ye</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Lin Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mukai Li</a>, <a href="/search/cs?searchtype=author&query=An%2C+C">Chenxin An</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+W">Wei Bi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingpeng Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17891v1-abstract-short" style="display: inline;"> Diffusion Language Models (DLMs) have emerged as a promising new paradigm for text generative modeling, potentially addressing limitations of autoregressive (AR) models. However, current DLMs have been studied at a smaller scale compared to their AR counterparts and lack fair comparison on language modeling benchmarks. Additionally, training diffusion models from scratch at scale remains challengi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17891v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17891v1-abstract-full" style="display: none;"> Diffusion Language Models (DLMs) have emerged as a promising new paradigm for text generative modeling, potentially addressing limitations of autoregressive (AR) models. However, current DLMs have been studied at a smaller scale compared to their AR counterparts and lack fair comparison on language modeling benchmarks. Additionally, training diffusion models from scratch at scale remains challenging. Given the prevalence of open-source AR language models, we propose adapting these models to build text diffusion models. We demonstrate connections between AR and diffusion modeling objectives and introduce a simple continual pre-training approach for training diffusion models. Through systematic evaluation on language modeling, reasoning, and commonsense benchmarks, we show that we can convert AR models ranging from 127M to 7B parameters (GPT2 and LLaMA) into diffusion models DiffuGPT and DiffuLLaMA, using less than 200B tokens for training. Our experimental results reveal that these models outperform earlier DLMs and are competitive with their AR counterparts. We release a suite of DLMs (with 127M, 355M, and 7B parameters) capable of generating fluent text, performing in-context learning, filling in the middle without prompt re-ordering, and following instructions \url{https://github.com/HKUNLP/DiffuLLaMA}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17891v1-abstract-full').style.display = 'none'; document.getElementById('2410.17891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages. Code: https://github.com/HKUNLP/DiffuLLaMA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15252">arXiv:2410.15252</a> <span> [<a href="https://arxiv.org/pdf/2410.15252">pdf</a>, <a href="https://arxiv.org/format/2410.15252">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Lossless KV Cache Compression to 2% </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J+N">J. N. Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Z">Zhanhui Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15252v1-abstract-short" style="display: inline;"> Large language models have revolutionized data processing in numerous domains, with their ability to handle extended context reasoning receiving notable recognition. To speed up inference, maintaining a key-value (KV) cache memory is essential. Nonetheless, the growing demands for KV cache memory create significant hurdles for efficient implementation. This work introduces a novel architecture, Cr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15252v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15252v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15252v1-abstract-full" style="display: none;"> Large language models have revolutionized data processing in numerous domains, with their ability to handle extended context reasoning receiving notable recognition. To speed up inference, maintaining a key-value (KV) cache memory is essential. Nonetheless, the growing demands for KV cache memory create significant hurdles for efficient implementation. This work introduces a novel architecture, Cross-Layer Latent Attention (CLLA), aimed at compressing the KV cache to less than 2% of its original size while maintaining comparable performance levels. CLLA integrates multiple aspects of KV cache compression, including attention head/dimension reduction, layer sharing, and quantization techniques, into a cohesive framework. Our extensive experiments demonstrate that CLLA achieves lossless performance on most tasks while utilizing minimal KV cache, marking a significant advancement in practical KV cache compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15252v1-abstract-full').style.display = 'none'; document.getElementById('2410.15252v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15025">arXiv:2410.15025</a> <span> [<a href="https://arxiv.org/pdf/2410.15025">pdf</a>, <a href="https://arxiv.org/format/2410.15025">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM-Driven Learning Analytics Dashboard for Teachers in EFL Writing Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minsun Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">SeonGyeom Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Suyoun Lee</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+Y">Yoosang Yoon</a>, <a href="/search/cs?searchtype=author&query=Myung%2C+J">Junho Myung</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+H">Haneul Yoo</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+H">Hyunseung Lim</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jieun Han</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yoonsu Kim</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+S">So-Yeon Ahn</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Juho Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+A">Alice Oh</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+H">Hwajung Hong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T+Y">Tak Yeon Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15025v1-abstract-short" style="display: inline;"> This paper presents the development of a dashboard designed specifically for teachers in English as a Foreign Language (EFL) writing education. Leveraging LLMs, the dashboard facilitates the analysis of student interactions with an essay writing system, which integrates ChatGPT for real-time feedback. The dashboard aids teachers in monitoring student behavior, identifying noneducational interactio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15025v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15025v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15025v1-abstract-full" style="display: none;"> This paper presents the development of a dashboard designed specifically for teachers in English as a Foreign Language (EFL) writing education. Leveraging LLMs, the dashboard facilitates the analysis of student interactions with an essay writing system, which integrates ChatGPT for real-time feedback. The dashboard aids teachers in monitoring student behavior, identifying noneducational interaction with ChatGPT, and aligning instructional strategies with learning objectives. By combining insights from NLP and Human-Computer Interaction (HCI), this study demonstrates how a human-centered approach can enhance the effectiveness of teacher dashboards, particularly in ChatGPT-integrated learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15025v1-abstract-full').style.display = 'none'; document.getElementById('2410.15025v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Workshop CustomNLP4U. arXiv admin note: text overlap with arXiv:2405.19691</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14944">arXiv:2410.14944</a> <span> [<a href="https://arxiv.org/pdf/2410.14944">pdf</a>, <a href="https://arxiv.org/format/2410.14944">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Part-Whole Relational Fusion Towards Multi-Modal Scene Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengxin Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shoukun Xu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jungong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14944v1-abstract-short" style="display: inline;"> Multi-modal fusion has played a vital role in multi-modal scene understanding. Most existing methods focus on cross-modal fusion involving two modalities, often overlooking more complex multi-modal fusion, which is essential for real-world applications like autonomous driving, where visible, depth, event, LiDAR, etc., are used. Besides, few attempts for multi-modal fusion, \emph{e.g.}, simple conc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14944v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14944v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14944v1-abstract-full" style="display: none;"> Multi-modal fusion has played a vital role in multi-modal scene understanding. Most existing methods focus on cross-modal fusion involving two modalities, often overlooking more complex multi-modal fusion, which is essential for real-world applications like autonomous driving, where visible, depth, event, LiDAR, etc., are used. Besides, few attempts for multi-modal fusion, \emph{e.g.}, simple concatenation, cross-modal attention, and token selection, cannot well dig into the intrinsic shared and specific details of multiple modalities. To tackle the challenge, in this paper, we propose a Part-Whole Relational Fusion (PWRF) framework. For the first time, this framework treats multi-modal fusion as part-whole relational fusion. It routes multiple individual part-level modalities to a fused whole-level modality using the part-whole relational routing ability of Capsule Networks (CapsNets). Through this part-whole routing, our PWRF generates modal-shared and modal-specific semantics from the whole-level modal capsules and the routing coefficients, respectively. On top of that, modal-shared and modal-specific details can be employed to solve the issue of multi-modal scene understanding, including synthetic multi-modal segmentation and visible-depth-thermal salient object detection in this paper. Experiments on several datasets demonstrate the superiority of the proposed PWRF framework for multi-modal scene understanding. The source code has been released on https://github.com/liuyi1989/PWRF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14944v1-abstract-full').style.display = 'none'; document.getElementById('2410.14944v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14684">arXiv:2410.14684</a> <span> [<a href="https://arxiv.org/pdf/2410.14684">pdf</a>, <a href="https://arxiv.org/format/2410.14684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RepoGraph: Enhancing AI Software Engineering with Repository-level Code Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ouyang%2C+S">Siru Ouyang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kaixin Ma</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zilin Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M">Mengzhao Jia</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14684v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) excel in code generation yet struggle with modern AI software engineering tasks. Unlike traditional function-level or file-level coding tasks, AI software engineering requires not only basic coding proficiency but also advanced skills in managing and interacting with code repositories. However, existing methods often overlook the need for repository-level code understa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14684v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14684v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14684v1-abstract-full" style="display: none;"> Large Language Models (LLMs) excel in code generation yet struggle with modern AI software engineering tasks. Unlike traditional function-level or file-level coding tasks, AI software engineering requires not only basic coding proficiency but also advanced skills in managing and interacting with code repositories. However, existing methods often overlook the need for repository-level code understanding, which is crucial for accurately grasping the broader context and developing effective solutions. On this basis, we present RepoGraph, a plug-in module that manages a repository-level structure for modern AI software engineering solutions. RepoGraph offers the desired guidance and serves as a repository-wide navigation for AI software engineers. We evaluate RepoGraph on the SWE-bench by plugging it into four different methods of two lines of approaches, where RepoGraph substantially boosts the performance of all systems, leading to a new state-of-the-art among open-source frameworks. Our analyses also demonstrate the extensibility and flexibility of RepoGraph by testing on another repo-level coding benchmark, CrossCodeEval. Our code is available at https://github.com/ozyyshr/RepoGraph. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14684v1-abstract-full').style.display = 'none'; document.getElementById('2410.14684v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13874">arXiv:2410.13874</a> <span> [<a href="https://arxiv.org/pdf/2410.13874">pdf</a>, <a href="https://arxiv.org/format/2410.13874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> COOL: Efficient and Reliable Chain-Oriented Objective Logic with Neural Networks Feedback Control for Program Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jipeng Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13874v2-abstract-short" style="display: inline;"> Program synthesis methods, whether formal or neural-based, lack fine-grained control and flexible modularity, which limits their adaptation to complex software development. These limitations stem from rigid Domain-Specific Language (DSL) frameworks and neural network incorrect predictions. To this end, we propose the Chain of Logic (CoL), which organizes synthesis stages into a chain and provides… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13874v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13874v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13874v2-abstract-full" style="display: none;"> Program synthesis methods, whether formal or neural-based, lack fine-grained control and flexible modularity, which limits their adaptation to complex software development. These limitations stem from rigid Domain-Specific Language (DSL) frameworks and neural network incorrect predictions. To this end, we propose the Chain of Logic (CoL), which organizes synthesis stages into a chain and provides precise heuristic control to guide the synthesis process. Furthermore, by integrating neural networks with libraries and introducing a Neural Network Feedback Control (NNFC) mechanism, our approach modularizes synthesis and mitigates the impact of neural network mispredictions. Experiments on relational and symbolic synthesis tasks show that CoL significantly enhances the efficiency and reliability of DSL program synthesis across multiple metrics. Specifically, CoL improves accuracy by 70% while reducing tree operations by 91% and time by 95%. Additionally, NNFC further boosts accuracy by 6%, with a 64% reduction in tree operations under challenging conditions such as insufficient training data, increased difficulty, and multidomain synthesis. These improvements confirm COOL as a highly efficient and reliable program synthesis framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13874v2-abstract-full').style.display = 'none'; document.getElementById('2410.13874v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13413">arXiv:2410.13413</a> <span> [<a href="https://arxiv.org/pdf/2410.13413">pdf</a>, <a href="https://arxiv.org/format/2410.13413">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Think Thrice Before You Act: Progressive Thought Refinement in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+C">Chengyu Du</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinyi Han</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+Y">Yizhou Ying</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Aili Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qianyu He</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haokun Zhao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Sirui Xia</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haoran Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiaqing Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zulong Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liangyue Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yanghua Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13413v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have demonstrated that progressive refinement, rather than providing a single answer, results in more accurate and thoughtful outputs. However, existing methods often rely heavily on supervision signals to evaluate previous responses, making it difficult to assess output quality in more open-ended scenarios effectively. Additionally, these method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13413v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13413v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13413v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have demonstrated that progressive refinement, rather than providing a single answer, results in more accurate and thoughtful outputs. However, existing methods often rely heavily on supervision signals to evaluate previous responses, making it difficult to assess output quality in more open-ended scenarios effectively. Additionally, these methods are typically designed for specific tasks, which limits their generalization to new domains. To address these limitations, we propose Progressive Thought Refinement (PTR), a framework that enables LLMs to refine their responses progressively. PTR operates in two phases: (1) Thought data construction stage: We propose a weak and strong model collaborative selection strategy to build a high-quality progressive refinement dataset to ensure logical consistency from thought to answers, and the answers are gradually refined in each round. (2) Thought-Mask Fine-Tuning Phase: We design a training structure to mask the "thought" and adjust loss weights to encourage LLMs to refine prior thought, teaching them to implicitly understand "how to improve" rather than "what is correct." Experimental results show that PTR significantly enhances LLM performance across ten diverse tasks (avg. from 49.6% to 53.5%) without task-specific fine-tuning. Notably, in more open-ended tasks, LLMs also demonstrate substantial improvements in the quality of responses beyond mere accuracy, suggesting that PTR truly teaches LLMs to self-improve over time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13413v1-abstract-full').style.display = 'none'; document.getElementById('2410.13413v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13360">arXiv:2410.13360</a> <span> [<a href="https://arxiv.org/pdf/2410.13360">pdf</a>, <a href="https://arxiv.org/format/2410.13360">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Augmented Personalization for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+H">Haoran Hao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaming Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changsheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu-Feng Li</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13360v2-abstract-short" style="display: inline;"> The development of large language models (LLMs) has significantly enhanced the capabilities of multimodal LLMs (MLLMs) as general assistants. However, lack of user-specific knowledge still restricts their application in human's daily life. In this paper, we introduce the Retrieval Augmented Personalization (RAP) framework for MLLMs' personalization. Starting from a general MLLM, we turn it into a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13360v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13360v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13360v2-abstract-full" style="display: none;"> The development of large language models (LLMs) has significantly enhanced the capabilities of multimodal LLMs (MLLMs) as general assistants. However, lack of user-specific knowledge still restricts their application in human's daily life. In this paper, we introduce the Retrieval Augmented Personalization (RAP) framework for MLLMs' personalization. Starting from a general MLLM, we turn it into a personalized assistant in three steps. (a) Remember: We design a key-value database to store user-related information, e.g., user's name, avatar and other attributes. (b) Retrieve: When the user initiates a conversation, RAP will retrieve relevant information from the database using a multimodal retriever. (c) Generate: The input query and retrieved concepts' information are fed into MLLMs to generate personalized, knowledge-augmented responses. Unlike previous methods, RAP allows real-time concept editing via updating the external database. To further improve generation quality and alignment with user-specific information, we design a pipeline for data collection and create a specialized dataset for personalized training of MLLMs. Based on the dataset, we train a series of MLLMs as personalized multimodal assistants. By pretraining on large-scale dataset, RAP-MLLMs can generalize to infinite visual concepts without additional finetuning. Our models demonstrate outstanding flexibility and generation quality across a variety of tasks, such as personalized image captioning, question answering and visual recognition. The code, data and models are available at https://github.com/Hoar012/RAP-MLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13360v2-abstract-full').style.display = 'none'; document.getElementById('2410.13360v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13027">arXiv:2410.13027</a> <span> [<a href="https://arxiv.org/pdf/2410.13027">pdf</a>, <a href="https://arxiv.org/format/2410.13027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Geometric Trajectory Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaqi Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minkai Xu</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+A">Aaron Lou</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Haotian Ye</a>, <a href="/search/cs?searchtype=author&query=Ermon%2C+S">Stefano Ermon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13027v1-abstract-short" style="display: inline;"> Generative models have shown great promise in generating 3D geometric systems, which is a fundamental problem in many natural science domains such as molecule and protein design. However, existing approaches only operate on static structures, neglecting the fact that physical systems are always dynamic in nature. In this work, we propose geometric trajectory diffusion models (GeoTDM), the first di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13027v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13027v1-abstract-full" style="display: none;"> Generative models have shown great promise in generating 3D geometric systems, which is a fundamental problem in many natural science domains such as molecule and protein design. However, existing approaches only operate on static structures, neglecting the fact that physical systems are always dynamic in nature. In this work, we propose geometric trajectory diffusion models (GeoTDM), the first diffusion model for modeling the temporal distribution of 3D geometric trajectories. Modeling such distribution is challenging as it requires capturing both the complex spatial interactions with physical symmetries and temporal correspondence encapsulated in the dynamics. We theoretically justify that diffusion models with equivariant temporal kernels can lead to density with desired symmetry, and develop a novel transition kernel leveraging SE(3)-equivariant spatial convolution and temporal attention. Furthermore, to induce an expressive trajectory distribution for conditional generation, we introduce a generalized learnable geometric prior into the forward diffusion process to enhance temporal conditioning. We conduct extensive experiments on both unconditional and conditional generation in various scenarios, including physical simulation, molecular dynamics, and pedestrian motion. Empirical results on a wide suite of metrics demonstrate that GeoTDM can generate realistic geometric trajectories with significantly higher quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13027v1-abstract-full').style.display = 'none'; document.getElementById('2410.13027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at NeurIPS 2024. 29 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12990">arXiv:2410.12990</a> <span> [<a href="https://arxiv.org/pdf/2410.12990">pdf</a>, <a href="https://arxiv.org/format/2410.12990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Low-Power Encoding for PAM-3 DRAM Bus </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nam%2C+J">Jonghyeon Nam</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jaeduk Han</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hokeun Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12990v1-abstract-short" style="display: inline;"> The 3-level pulse amplitude modulation (PAM-3) signaling is expected to be widely used in memory interfaces for its greater voltage margins compared to PAM-4. To maximize the benefit of PAM-3, we propose three low-power data encoding algorithms: PAM3-DBI, PAM3-MF, and PAM3-SORT. With the DRAM memory traces from the gem5 computer architecture simulator running benchmarks, we evaluate the energy eff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12990v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12990v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12990v1-abstract-full" style="display: none;"> The 3-level pulse amplitude modulation (PAM-3) signaling is expected to be widely used in memory interfaces for its greater voltage margins compared to PAM-4. To maximize the benefit of PAM-3, we propose three low-power data encoding algorithms: PAM3-DBI, PAM3-MF, and PAM3-SORT. With the DRAM memory traces from the gem5 computer architecture simulator running benchmarks, we evaluate the energy efficiency of our three PAM-3 encoding techniques. The experimental results show the proposed algorithms can reduce termination power for high-speed memory links significantly by 41% to 90% for benchmark programs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12990v1-abstract-full').style.display = 'none'; document.getElementById('2410.12990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proceedings of the 20th International Conference on Synthesis, Modeling, Analysis and Simulation Methods, and Applications to Circuit Design (SMACD 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11924">arXiv:2410.11924</a> <span> [<a href="https://arxiv.org/pdf/2410.11924">pdf</a>, <a href="https://arxiv.org/format/2410.11924">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Prompt-Guided Spatio-Temporal Transformer Model for National-Wide Nuclear Radiation Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+T">Tengfei Lyu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jindong Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11924v1-abstract-short" style="display: inline;"> Nuclear radiation (NR), which refers to the energy emitted from atomic nuclei during decay, poses substantial risks to human health and environmental safety. Accurate forecasting of nuclear radiation levels is crucial for informed decision-making by both individuals and governments. However, this task is challenging due to the imbalanced distribution of monitoring stations over a wide spatial rang… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11924v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11924v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11924v1-abstract-full" style="display: none;"> Nuclear radiation (NR), which refers to the energy emitted from atomic nuclei during decay, poses substantial risks to human health and environmental safety. Accurate forecasting of nuclear radiation levels is crucial for informed decision-making by both individuals and governments. However, this task is challenging due to the imbalanced distribution of monitoring stations over a wide spatial range and the non-stationary radiation variation patterns. In this study, we introduce NRFormer, an innovative framework tailored for national-wide prediction of nuclear radiation variations. By integrating a non-stationary temporal attention module, an imbalance-aware spatial attention module, and a radiation propagation prompting module, NRFormer collectively captures complex spatio-temporal dynamics of nuclear radiation. Extensive experiments on two real-world datasets demonstrate the superiority of our proposed framework against seven baselines. This research not only enhances the accuracy and reliability in nuclear radiation forecasting but also contributes to advancing emergency response strategies and monitoring systems, thereby safeguarding environmental and public health. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11924v1-abstract-full').style.display = 'none'; document.getElementById('2410.11924v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10646">arXiv:2410.10646</a> <span> [<a href="https://arxiv.org/pdf/2410.10646">pdf</a>, <a href="https://arxiv.org/format/2410.10646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DR-MPC: Deep Residual Model Predictive Control for Real-world Social Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J+R">James R. Han</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+H">Hugues Thomas</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Rhinehart%2C+N">Nicholas Rhinehart</a>, <a href="/search/cs?searchtype=author&query=Barfoot%2C+T+D">Timothy D. Barfoot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10646v1-abstract-short" style="display: inline;"> How can a robot safely navigate around people exhibiting complex motion patterns? Reinforcement Learning (RL) or Deep RL (DRL) in simulation holds some promise, although much prior work relies on simulators that fail to precisely capture the nuances of real human motion. To address this gap, we propose Deep Residual Model Predictive Control (DR-MPC), a method to enable robots to quickly and safely… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10646v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10646v1-abstract-full" style="display: none;"> How can a robot safely navigate around people exhibiting complex motion patterns? Reinforcement Learning (RL) or Deep RL (DRL) in simulation holds some promise, although much prior work relies on simulators that fail to precisely capture the nuances of real human motion. To address this gap, we propose Deep Residual Model Predictive Control (DR-MPC), a method to enable robots to quickly and safely perform DRL from real-world crowd navigation data. By blending MPC with model-free DRL, DR-MPC overcomes the traditional DRL challenges of large data requirements and unsafe initial behavior. DR-MPC is initialized with MPC-based path tracking, and gradually learns to interact more effectively with humans. To further accelerate learning, a safety component estimates when the robot encounters out-of-distribution states and guides it away from likely collisions. In simulation, we show that DR-MPC substantially outperforms prior work, including traditional DRL and residual DRL models. Real-world experiments show our approach successfully enables a robot to navigate a variety of crowded situations with few errors using less than 4 hours of training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10646v1-abstract-full').style.display = 'none'; document.getElementById('2410.10646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, under review for IEEE Robotics and Automation Letters (RA-L)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10335">arXiv:2410.10335</a> <span> [<a href="https://arxiv.org/pdf/2410.10335">pdf</a>, <a href="https://arxiv.org/ps/2410.10335">ps</a>, <a href="https://arxiv.org/format/2410.10335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Performance of a Threshold-based WDM and ACM for FSO Communication between Mobile Platforms in Maritime Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jae-Eun Han</a>, <a href="/search/cs?searchtype=author&query=Nam%2C+S+S">Sung Sik Nam</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+D+D">Duck Dong Hwang</a>, <a href="/search/cs?searchtype=author&query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10335v1-abstract-short" style="display: inline;"> In this study, we statistically analyze the performance of a threshold-based multiple optical signal selection scheme (TMOS) for wavelength division multiplexing (WDM) and adaptive coded modulation (ACM) using free space optical (FSO) communication between mobile platforms in maritime environments with fog and 3D pointing errors. Specifically, we derive a new closed-form expression for a composite… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10335v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10335v1-abstract-full" style="display: none;"> In this study, we statistically analyze the performance of a threshold-based multiple optical signal selection scheme (TMOS) for wavelength division multiplexing (WDM) and adaptive coded modulation (ACM) using free space optical (FSO) communication between mobile platforms in maritime environments with fog and 3D pointing errors. Specifically, we derive a new closed-form expression for a composite probability density function (PDF) that is more appropriate for applying various algorithms to FSO systems under the combined effects of fog and pointing errors. We then analyze the outage probability, average spectral efficiency (ASE), and bit error rate (BER) performance of the conventional detection techniques (i.e., heterodyne and intensity modulation/direct detection). The derived analytical results were cross-verified using Monte Carlo simulations. The results show that we can obtain a higher ASE performance by applying TMOS-based WDM and ACM and that the probability of the beam being detected in the photodetector increased at a low signal-to-noise ratio, contrary to conventional performance. Furthermore, it has been confirmed that applying WDM and ACM is suitable, particularly in maritime environments where channel conditions frequently change. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10335v1-abstract-full').style.display = 'none'; document.getElementById('2410.10335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10141">arXiv:2410.10141</a> <span> [<a href="https://arxiv.org/pdf/2410.10141">pdf</a>, <a href="https://arxiv.org/format/2410.10141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Temperature-Centric Investigation of Speculative Decoding with Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ouyang%2C+S">Siru Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuohang Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Minhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+M">Ming Zhong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Donghan Yu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yelong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10141v1-abstract-short" style="display: inline;"> Speculative decoding stands as a pivotal technique to expedite inference in autoregressive (large) language models. This method employs a smaller draft model to speculate a block of tokens, which the target model then evaluates for acceptance. Despite a wealth of studies aimed at increasing the efficiency of speculative decoding, the influence of generation configurations on the decoding process r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10141v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10141v1-abstract-full" style="display: none;"> Speculative decoding stands as a pivotal technique to expedite inference in autoregressive (large) language models. This method employs a smaller draft model to speculate a block of tokens, which the target model then evaluates for acceptance. Despite a wealth of studies aimed at increasing the efficiency of speculative decoding, the influence of generation configurations on the decoding process remains poorly understood, especially concerning decoding temperatures. This paper delves into the effects of decoding temperatures on speculative decoding's efficacy. Beginning with knowledge distillation (KD), we first highlight the challenge of decoding at higher temperatures, and demonstrate KD in a consistent temperature setting could be a remedy. We also investigate the effects of out-of-domain testing sets with out-of-range temperatures. Building upon these findings, we take an initial step to further the speedup for speculative decoding, particularly in a high-temperature generation setting. Our work offers new insights into how generation configurations drastically affect the performance of speculative decoding, and underscores the need for developing methods that focus on diverse decoding configurations. Code is publically available at https://github.com/ozyyshr/TempSpec. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10141v1-abstract-full').style.display = 'none'; document.getElementById('2410.10141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09009">arXiv:2410.09009</a> <span> [<a href="https://arxiv.org/pdf/2410.09009">pdf</a>, <a href="https://arxiv.org/format/2410.09009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semantic Score Distillation Sampling for Compositional Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+L">Ling Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junlin Han</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Bohan Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runjia Li</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09009v1-abstract-short" style="display: inline;"> Generating high-quality 3D assets from textual descriptions remains a pivotal challenge in computer graphics and vision research. Due to the scarcity of 3D data, state-of-the-art approaches utilize pre-trained 2D diffusion priors, optimized through Score Distillation Sampling (SDS). Despite progress, crafting complex 3D scenes featuring multiple objects or intricate interactions is still difficult… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09009v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09009v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09009v1-abstract-full" style="display: none;"> Generating high-quality 3D assets from textual descriptions remains a pivotal challenge in computer graphics and vision research. Due to the scarcity of 3D data, state-of-the-art approaches utilize pre-trained 2D diffusion priors, optimized through Score Distillation Sampling (SDS). Despite progress, crafting complex 3D scenes featuring multiple objects or intricate interactions is still difficult. To tackle this, recent methods have incorporated box or layout guidance. However, these layout-guided compositional methods often struggle to provide fine-grained control, as they are generally coarse and lack expressiveness. To overcome these challenges, we introduce a novel SDS approach, Semantic Score Distillation Sampling (SemanticSDS), designed to effectively improve the expressiveness and accuracy of compositional text-to-3D generation. Our approach integrates new semantic embeddings that maintain consistency across different rendering views and clearly differentiate between various objects and parts. These embeddings are transformed into a semantic map, which directs a region-specific SDS process, enabling precise optimization and compositional generation. By leveraging explicit semantic guidance, our method unlocks the compositional capabilities of existing pre-trained diffusion models, thereby achieving superior quality in 3D content generation, particularly for complex objects and scenes. Experimental results demonstrate that our SemanticSDS framework is highly effective for generating state-of-the-art complex 3D content. Code: https://github.com/YangLing0818/SemanticSDS-3D <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09009v1-abstract-full').style.display = 'none'; document.getElementById('2410.09009v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project: https://github.com/YangLing0818/SemanticSDS-3D</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08947">arXiv:2410.08947</a> <span> [<a href="https://arxiv.org/pdf/2410.08947">pdf</a>, <a href="https://arxiv.org/format/2410.08947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Meta-Transfer Learning Empowered Temporal Graph Networks for Cross-City Real Estate Appraisal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jindong Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wei Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08947v1-abstract-short" style="display: inline;"> Real estate appraisal is important for a variety of endeavors such as real estate deals, investment analysis, and real property taxation. Recently, deep learning has shown great promise for real estate appraisal by harnessing substantial online transaction data from web platforms. Nonetheless, deep learning is data-hungry, and thus it may not be trivially applicable to enormous small cities with l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08947v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08947v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08947v1-abstract-full" style="display: none;"> Real estate appraisal is important for a variety of endeavors such as real estate deals, investment analysis, and real property taxation. Recently, deep learning has shown great promise for real estate appraisal by harnessing substantial online transaction data from web platforms. Nonetheless, deep learning is data-hungry, and thus it may not be trivially applicable to enormous small cities with limited data. To this end, we propose Meta-Transfer Learning Empowered Temporal Graph Networks (MetaTransfer) to transfer valuable knowledge from multiple data-rich metropolises to the data-scarce city to improve valuation performance. Specifically, by modeling the ever-growing real estate transactions with associated residential communities as a temporal event heterogeneous graph, we first design an Event-Triggered Temporal Graph Network to model the irregular spatiotemporal correlations between evolving real estate transactions. Besides, we formulate the city-wide real estate appraisal as a multi-task dynamic graph link label prediction problem, where the valuation of each community in a city is regarded as an individual task. A Hypernetwork-Based Multi-Task Learning module is proposed to simultaneously facilitate intra-city knowledge sharing between multiple communities and task-specific parameters generation to accommodate the community-wise real estate price distribution. Furthermore, we propose a Tri-Level Optimization Based Meta- Learning framework to adaptively re-weight training transaction instances from multiple source cities to mitigate negative transfer, and thus improve the cross-city knowledge transfer effectiveness. Finally, extensive experiments based on five real-world datasets demonstrate the significant superiority of MetaTransfer compared with eleven baseline algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08947v1-abstract-full').style.display = 'none'; document.getElementById('2410.08947v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08164">arXiv:2410.08164</a> <span> [<a href="https://arxiv.org/pdf/2410.08164">pdf</a>, <a href="https://arxiv.org/format/2410.08164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Agent S: An Open Agentic Framework that Uses Computers Like a Human </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Agashe%2C+S">Saaket Agashe</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiuzhou Han</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+S">Shuyu Gan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiachen Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Ang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+E">Xin Eric Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08164v1-abstract-short" style="display: inline;"> We present Agent S, an open agentic framework that enables autonomous interaction with computers through a Graphical User Interface (GUI), aimed at transforming human-computer interaction by automating complex, multi-step tasks. Agent S aims to address three key challenges in automating computer tasks: acquiring domain-specific knowledge, planning over long task horizons, and handling dynamic, non… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08164v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08164v1-abstract-full" style="display: none;"> We present Agent S, an open agentic framework that enables autonomous interaction with computers through a Graphical User Interface (GUI), aimed at transforming human-computer interaction by automating complex, multi-step tasks. Agent S aims to address three key challenges in automating computer tasks: acquiring domain-specific knowledge, planning over long task horizons, and handling dynamic, non-uniform interfaces. To this end, Agent S introduces experience-augmented hierarchical planning, which learns from external knowledge search and internal experience retrieval at multiple levels, facilitating efficient task planning and subtask execution. In addition, it employs an Agent-Computer Interface (ACI) to better elicit the reasoning and control capabilities of GUI agents based on Multimodal Large Language Models (MLLMs). Evaluation on the OSWorld benchmark shows that Agent S outperforms the baseline by 9.37% on success rate (an 83.6% relative improvement) and achieves a new state-of-the-art. Comprehensive analysis highlights the effectiveness of individual components and provides insights for future improvements. Furthermore, Agent S demonstrates broad generalizability to different operating systems on a newly-released WindowsAgentArena benchmark. Code available at https://github.com/simular-ai/Agent-S. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08164v1-abstract-full').style.display = 'none'; document.getElementById('2410.08164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 16 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07739">arXiv:2410.07739</a> <span> [<a href="https://arxiv.org/pdf/2410.07739">pdf</a>, <a href="https://arxiv.org/format/2410.07739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SLIM: Let LLM Learn More and Forget Less with Soft LoRA and Identity Mixture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiayi Han</a>, <a href="/search/cs?searchtype=author&query=Du%2C+L">Liang Du</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hongwei Du</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiangguo Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yiwen Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Weibo Zheng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+D">Donghong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07739v1-abstract-short" style="display: inline;"> Although many efforts have been made, it is still a challenge to balance the training budget, downstream performance, and the general capabilities of the LLMs in many applications. Training the whole model for downstream tasks is expensive, and could easily result in catastrophic forgetting. By introducing parameter-efficient fine-tuning (PEFT), the training cost could be reduced, but it still suf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07739v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07739v1-abstract-full" style="display: none;"> Although many efforts have been made, it is still a challenge to balance the training budget, downstream performance, and the general capabilities of the LLMs in many applications. Training the whole model for downstream tasks is expensive, and could easily result in catastrophic forgetting. By introducing parameter-efficient fine-tuning (PEFT), the training cost could be reduced, but it still suffers from forgetting, and limits the learning on the downstream tasks. To efficiently fine-tune the LLMs with less limitation to their downstream performance while mitigating the forgetting of general capabilities, we propose a novel mixture of expert (MoE) framework based on Soft LoRA and Identity Mixture (SLIM), that allows dynamic routing between LoRA adapters and skipping connection, enables the suppression of forgetting. We adopt weight-yielding with sliding clustering for better out-of-domain distinguish to enhance the routing. We also propose to convert the mixture of low-rank adapters to the model merging formulation and introduce fast dynamic merging of LoRA adapters to keep the general capabilities of the base model. Extensive experiments demonstrate that the proposed SLIM is comparable to the state-of-the-art PEFT approaches on the downstream tasks while achieving the leading performance in mitigating catastrophic forgetting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07739v1-abstract-full').style.display = 'none'; document.getElementById('2410.07739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07157">arXiv:2410.07157</a> <span> [<a href="https://arxiv.org/pdf/2410.07157">pdf</a>, <a href="https://arxiv.org/format/2410.07157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> InstructG2I: Synthesizing Images from Multimodal Attributed Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bowen Jin</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Z">Ziqi Pang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+B">Bingjun Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Xiong Wang</a>, <a href="/search/cs?searchtype=author&query=You%2C+J">Jiaxuan You</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07157v1-abstract-short" style="display: inline;"> In this paper, we approach an overlooked yet critical task Graph2Image: generating images from multimodal attributed graphs (MMAGs). This task poses significant challenges due to the explosion in graph size, dependencies among graph entities, and the need for controllability in graph conditions. To address these challenges, we propose a graph context-conditioned diffusion model called InstructG2I.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07157v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07157v1-abstract-full" style="display: none;"> In this paper, we approach an overlooked yet critical task Graph2Image: generating images from multimodal attributed graphs (MMAGs). This task poses significant challenges due to the explosion in graph size, dependencies among graph entities, and the need for controllability in graph conditions. To address these challenges, we propose a graph context-conditioned diffusion model called InstructG2I. InstructG2I first exploits the graph structure and multimodal information to conduct informative neighbor sampling by combining personalized page rank and re-ranking based on vision-language features. Then, a Graph-QFormer encoder adaptively encodes the graph nodes into an auxiliary set of graph prompts to guide the denoising process of diffusion. Finally, we propose graph classifier-free guidance, enabling controllable generation by varying the strength of graph guidance and multiple connected edges to a node. Extensive experiments conducted on three datasets from different domains demonstrate the effectiveness and controllability of our approach. The code is available at https://github.com/PeterGriffinJin/InstructG2I. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07157v1-abstract-full').style.display = 'none'; document.getElementById('2410.07157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPs 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05983">arXiv:2410.05983</a> <span> [<a href="https://arxiv.org/pdf/2410.05983">pdf</a>, <a href="https://arxiv.org/format/2410.05983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Long-Context LLMs Meet RAG: Overcoming Challenges for Long Inputs in RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bowen Jin</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+J">Jinsung Yoon</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Arik%2C+S+O">Sercan O. Arik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05983v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) empowers large language models (LLMs) to utilize external knowledge sources. The increasing capacity of LLMs to process longer input sequences opens up avenues for providing more retrieved information, to potentially enhance the quality of generated outputs. It is plausible to assume that a larger retrieval set would contain more relevant information (higher re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05983v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05983v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) empowers large language models (LLMs) to utilize external knowledge sources. The increasing capacity of LLMs to process longer input sequences opens up avenues for providing more retrieved information, to potentially enhance the quality of generated outputs. It is plausible to assume that a larger retrieval set would contain more relevant information (higher recall), that might result in improved performance. However, our empirical findings demonstrate that for many long-context LLMs, the quality of generated output initially improves first, but then subsequently declines as the number of retrieved passages increases. This paper investigates this phenomenon, identifying the detrimental impact of retrieved "hard negatives" as a key contributor. To mitigate this and enhance the robustness of long-context LLM-based RAG, we propose both training-free and training-based approaches. We first showcase the effectiveness of retrieval reordering as a simple yet powerful training-free optimization. Furthermore, we explore training-based methods, specifically RAG-specific implicit LLM fine-tuning and RAG-oriented fine-tuning with intermediate reasoning, demonstrating their capacity for substantial performance gains. Finally, we conduct a systematic analysis of design choices for these training-based methods, including data distribution, retriever selection, and training context length. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05983v1-abstract-full').style.display = 'none'; document.getElementById('2410.05983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Han%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>