Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,186 results for author: <span class="mathjax">Zhang, M</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. 
<a href="/search/?searchtype=author&query=Zhang%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhang, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level 
breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=200" class="pagination-link " aria-label="Goto page 5">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14279">arXiv:2411.14279</a> <span> [<a href="https://arxiv.org/pdf/2411.14279">pdf</a>, <a 
href="https://arxiv.org/format/2411.14279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Looking Beyond Text: Reducing Language bias in Large Vision-Language Models via Multimodal Dual-Attention and Soft-Image Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Baobao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14279v1-abstract-short" style="display: inline;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. 
Different scales of training data between the pretraining stage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14279v1-abstract-full" style="display: none;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage of LLM and multimodal alignment stage. 2. The learned inference bias due to short-term dependency of text data. Therefore, we propose LACING, a systemic framework designed to address the language bias of LVLMs with muLtimodal duAl-attention meChanIsm (MDA) aNd soft-image Guidance (IFG). Specifically, MDA introduces a parallel dual-attention mechanism that enhances the integration of visual inputs across the model. IFG introduces a learnable soft visual prompt during training and inference to replace visual inputs, designed to compel LVLMs to prioritize text inputs. Then, IFG further proposes a novel decoding strategy using the soft visual prompt to mitigate the model's over-reliance on adjacent text inputs. Comprehensive experiments demonstrate that our method effectively debiases LVLMs from their language bias, enhancing visual comprehension and reducing hallucinations without requiring additional training resources or data. The code and model are available at [lacing-lvlm.github.io](https://lacing-lvlm.github.io). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'none'; document.getElementById('2411.14279v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14119">arXiv:2411.14119</a> <span> [<a href="https://arxiv.org/pdf/2411.14119">pdf</a>, <a href="https://arxiv.org/format/2411.14119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Regression for Socio-Economic Estimation via Multi-View Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Ishida%2C+S">Sahoko Ishida</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Jenson%2C+D">Daniel Jenson</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+S">Swapnil Mishra</a>, <a href="/search/cs?searchtype=author&query=Navott%2C+J">Jhonathan Navott</a>, <a href="/search/cs?searchtype=author&query=Flaxman%2C+S">Seth Flaxman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2411.14119v1-abstract-short" style="display: inline;"> Remote sensing imagery offers rich spectral data across extensive areas for Earth observation. Many attempts have been made to leverage these data with transfer learning to develop scalable alternatives for estimating socio-economic conditions, reducing reliance on expensive survey-collected data. However, much of this research has primarily focused on daytime satellite imagery due to the limitati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14119v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14119v1-abstract-full" style="display: none;"> Remote sensing imagery offers rich spectral data across extensive areas for Earth observation. Many attempts have been made to leverage these data with transfer learning to develop scalable alternatives for estimating socio-economic conditions, reducing reliance on expensive survey-collected data. However, much of this research has primarily focused on daytime satellite imagery due to the limitation that most pre-trained models are trained on 3-band RGB images. Consequently, modeling techniques for spectral bands beyond the visible spectrum have not been thoroughly investigated. Additionally, quantifying uncertainty in remote sensing regression has been less explored, yet it is essential for more informed targeting and iterative collection of ground truth survey data. In this paper, we introduce a novel framework that leverages generic foundational vision models to process remote sensing imagery using combinations of three spectral bands to exploit multi-spectral data. We also employ methods such as heteroscedastic regression and Bayesian modeling to generate uncertainty estimates for the predictions. 
Experimental results demonstrate that our method outperforms existing models that use RGB or multi-spectral models with unstructured band usage. Moreover, our framework helps identify uncertain predictions, guiding future ground truth data acquisition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14119v1-abstract-full').style.display = 'none'; document.getElementById('2411.14119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14055">arXiv:2411.14055</a> <span> [<a href="https://arxiv.org/pdf/2411.14055">pdf</a>, <a href="https://arxiv.org/format/2411.14055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DRPruning: Efficient Large Language Model Pruning through Distributionally Robust Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hexuan Deng</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+W">Wenxiang Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuebo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14055v1-abstract-short" style="display: inline;"> Large language models (LLMs) deliver impressive results but face challenges from increasing model sizes and computational costs. Structured pruning reduces model size and speeds up inference but often causes uneven degradation across domains, leading to biased performance. To address this, we propose DRPruning, which incorporates distributionally robust optimization to restore balanced performance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14055v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14055v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14055v1-abstract-full" style="display: none;"> Large language models (LLMs) deliver impressive results but face challenges from increasing model sizes and computational costs. Structured pruning reduces model size and speeds up inference but often causes uneven degradation across domains, leading to biased performance. To address this, we propose DRPruning, which incorporates distributionally robust optimization to restore balanced performance across domains, along with further improvements to enhance robustness. Experiments in monolingual and multilingual settings show that our method surpasses similarly sized models in pruning and continued pretraining over perplexity, downstream tasks, and instruction tuning. We further provide analysis demonstrating the robustness of our method towards various domains and distribution shifts. Furthermore, our method automatically determines optimal reference losses and data ratios, suggesting potential for broader applications. Our code is available at https://github.com/hexuandeng/DRPruning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14055v1-abstract-full').style.display = 'none'; document.getElementById('2411.14055v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13715">arXiv:2411.13715</a> <span> [<a href="https://arxiv.org/pdf/2411.13715">pdf</a>, <a href="https://arxiv.org/format/2411.13715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SimPhony: A Device-Circuit-Architecture Cross-Layer Modeling and Simulation Framework for Heterogeneous Electronic-Photonic AI System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&query=Begovic%2C+A">Amir Begovic</a>, <a 
href="/search/cs?searchtype=author&query=Huang%2C+R">Rena Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13715v1-abstract-short" style="display: inline;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13715v1-abstract-full" style="display: none;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accurate, fast, and easy-to-use EPIC AI system simulation framework significantly limits the exploration of hardware innovations and system evaluations on common benchmarks. To address this gap, we propose SimPhony, a cross-layer modeling and simulation framework for heterogeneous electronic-photonic AI systems. 
SimPhony offers a platform that enables (1) generic, extensible hardware topology representation that supports heterogeneous multi-core architectures with diverse photonic tensor core designs; (2) optics-specific dataflow modeling with unique multi-dimensional parallelism and reuse beyond spatial/temporal dimensions; (3) data-aware energy modeling with realistic device responses, layout-aware area estimation, link budget analysis, and bandwidth-adaptive memory modeling; and (4) seamless integration with model training framework for hardware/software co-simulation. By providing a unified, versatile, and high-fidelity simulation platform, SimPhony enables researchers to innovate and evaluate EPIC AI hardware across multiple domains, facilitating the next leap in emerging AI hardware. We open-source our codes at https://github.com/ScopeX-ASU/SimPhony <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'none'; document.getElementById('2411.13715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13239">arXiv:2411.13239</a> <span> [<a href="https://arxiv.org/pdf/2411.13239">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Transforming the Hybrid Cloud for Emerging AI Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deming Chen</a>, <a href="/search/cs?searchtype=author&query=Youssef%2C+A">Alaa Youssef</a>, <a href="/search/cs?searchtype=author&query=Pendse%2C+R">Ruchi Pendse</a>, <a href="/search/cs?searchtype=author&query=Schleife%2C+A">André Schleife</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+B+K">Bryan K. 
Clark</a>, <a href="/search/cs?searchtype=author&query=Hamann%2C+H">Hendrik Hamann</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingrui He</a>, <a href="/search/cs?searchtype=author&query=Laino%2C+T">Teodoro Laino</a>, <a href="/search/cs?searchtype=author&query=Varshney%2C+L">Lav Varshney</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxiong Wang</a>, <a href="/search/cs?searchtype=author&query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&query=Jabbarvand%2C+R">Reyhaneh Jabbarvand</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianyin Xu</a>, <a href="/search/cs?searchtype=author&query=Kindratenko%2C+V">Volodymyr Kindratenko</a>, <a href="/search/cs?searchtype=author&query=Costa%2C+C">Carlos Costa</a>, <a href="/search/cs?searchtype=author&query=Adve%2C+S">Sarita Adve</a>, <a href="/search/cs?searchtype=author&query=Mendis%2C+C">Charith Mendis</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=N%C3%BA%C3%B1ez-Corrales%2C+S">Santiago Núñez-Corrales</a>, <a href="/search/cs?searchtype=author&query=Ganti%2C+R">Raghu Ganti</a>, <a href="/search/cs?searchtype=author&query=Srivatsa%2C+M">Mudhakar Srivatsa</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+N+S">Nam Sung Kim</a>, <a href="/search/cs?searchtype=author&query=Torrellas%2C+J">Josep Torrellas</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jian Huang</a>, <a href="/search/cs?searchtype=author&query=Seelam%2C+S">Seetharami Seelam</a> , et al. 
(19 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13239v1-abstract-short" style="display: inline;"> This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches, emphasizing usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge techno… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13239v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13239v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13239v1-abstract-full" style="display: none;"> This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches, emphasizing usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge technologies such as generative and agentic AI, cross-layer automation and optimization, unified control plane, and composable and adaptive system architecture, the proposed framework addresses critical challenges in energy efficiency, performance, and cost-effectiveness. Incorporating quantum computing as it matures will enable quantum-accelerated simulations for materials science, climate modeling, and other high-impact domains. 
Collaborative efforts between academia and industry are central to this vision, driving advancements in foundation models for material design and climate solutions, scalable multimodal data processing, and enhanced physics-based AI emulators for applications like weather forecasting and carbon sequestration. Research priorities include advancing AI agentic systems, LLM as an Abstraction (LLMaaA), AI model optimization and unified abstractions across heterogeneous infrastructure, end-to-end edge-cloud transformation, efficient programming model, middleware and platform, secure infrastructure, application-adaptive cloud systems, and new quantum-classical collaborative workflows. These ideas and solutions encompass both theoretical and practical research questions, requiring coordinated input and support from the research community. This joint initiative aims to establish hybrid clouds as secure, efficient, and sustainable platforms, fostering breakthroughs in AI-driven applications and scientific discovery across academia, industry, and society. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13239v1-abstract-full').style.display = 'none'; document.getElementById('2411.13239v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">70 pages, 27 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13159">arXiv:2411.13159</a> <span> [<a href="https://arxiv.org/pdf/2411.13159">pdf</a>, <a href="https://arxiv.org/format/2411.13159">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hard-Synth: Synthesizing Diverse Hard Samples for ASR using Zero-Shot TTS and LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiawei Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuang Li</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+X">Xiaosong Qiao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiaofeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hao Yang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jinsong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13159v1-abstract-short" style="display: inline;"> Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using 
text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large lang… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13159v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13159v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13159v1-abstract-full" style="display: none;"> Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large language models (LLMs) and advanced zero-shot TTS. Our approach employs LLMs to generate diverse in-domain text through rewriting, without relying on additional text data. Rather than using predefined speech styles, we introduce a hard prompt selection method with zero-shot TTS to clone speech styles that the ASR model finds challenging to recognize. Experiments demonstrate that Hard-Synth significantly enhances the Conformer model, achieving relative word error rate (WER) reductions of 6.5\%/4.4\% on LibriSpeech dev/test-other subsets. Additionally, we show that Hard-Synth is data-efficient and capable of reducing bias in ASR. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13159v1-abstract-full').style.display = 'none'; document.getElementById('2411.13159v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13069">arXiv:2411.13069</a> <span> [<a href="https://arxiv.org/pdf/2411.13069">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automatic marker-free registration based on similar tetrahedras for single-tree point clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jing Ren</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanlong Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuhan Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhang Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenxin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingtai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingyun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13069v1-abstract-short" style="display: inline;"> In recent years, terrestrial laser scanning technology has been widely used to collect tree point cloud data, aiding in measurements of diameter at 
breast height, biomass, and other forestry survey data. Since a single scan from terrestrial laser systems captures data from only one angle, multiple scans must be registered and fused to obtain complete tree point cloud data. This paper proposes a ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13069v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13069v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13069v1-abstract-full" style="display: none;"> In recent years, terrestrial laser scanning technology has been widely used to collect tree point cloud data, aiding in measurements of diameter at breast height, biomass, and other forestry survey data. Since a single scan from terrestrial laser systems captures data from only one angle, multiple scans must be registered and fused to obtain complete tree point cloud data. This paper proposes a marker-free automatic registration method for single-tree point clouds based on similar tetrahedras. First, two point clouds from two scans of the same tree are used to generate tree skeletons, and key point sets are constructed from these skeletons. Tetrahedra are then filtered and matched according to similarity principles, with the vertices of these two matched tetrahedras selected as matching point pairs, thus completing the coarse registration of the point clouds from the two scans. Subsequently, the ICP method is applied to the coarse-registered leaf point clouds to obtain fine registration parameters, completing the precise registration of the two tree point clouds. Experiments were conducted using terrestrial laser scanning data from eight trees, each from different species and with varying shapes. The proposed method was evaluated using RMSE and Hausdorff distance, compared against the traditional ICP and NDT methods. 
The experimental results demonstrate that the proposed method significantly outperforms both ICP and NDT in registration accuracy, achieving speeds up to 593 times and 113 times faster than ICP and NDT, respectively. In summary, the proposed method shows good robustness in single-tree point cloud registration, with significant advantages in accuracy and speed compared to traditional ICP and NDT methods, indicating excellent application prospects in practical registration scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13069v1-abstract-full').style.display = 'none'; document.getElementById('2411.13069v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">remote sensing; terrestrial lidar; multi-scan cloud registration</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12502">arXiv:2411.12502</a> <span> [<a href="https://arxiv.org/pdf/2411.12502">pdf</a>, <a href="https://arxiv.org/format/2411.12502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Transformer Neural Processes -- Kernel Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&query=Jenson%2C+D">Daniel Jenson</a>, <a href="/search/cs?searchtype=author&query=Navott%2C+J">Jhonathan Navott</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+M">Makkunda Sharma</a>, <a href="/search/cs?searchtype=author&query=Semenova%2C+E">Elizaveta Semenova</a>, <a href="/search/cs?searchtype=author&query=Flaxman%2C+S">Seth Flaxman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12502v1-abstract-short" style="display: inline;"> Stochastic processes model various natural phenomena from disease transmission to stock prices, but simulating and quantifying their uncertainty can be computationally challenging. For example, modeling a Gaussian Process with standard statistical methods incurs an $\mathcal{O}(n^3)$ penalty, and even using state-of-the-art Neural Processes (NPs) incurs an $\mathcal{O}(n^2)$ penalty due to the att… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12502v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12502v1-abstract-full" style="display: none;"> Stochastic processes model various natural phenomena from disease transmission to stock prices, but simulating and quantifying their uncertainty can be computationally challenging. For example, modeling a Gaussian Process with standard statistical methods incurs an $\mathcal{O}(n^3)$ penalty, and even using state-of-the-art Neural Processes (NPs) incurs an $\mathcal{O}(n^2)$ penalty due to the attention mechanism. 
We introduce the Transformer Neural Process - Kernel Regression (TNP-KR), a new architecture that incorporates a novel transformer block we call a Kernel Regression Block (KRBlock), which reduces the computational complexity of attention in transformer-based Neural Processes (TNPs) from $\mathcal{O}((n_C+n_T)^2)$ to $O(n_C^2+n_Cn_T)$ by eliminating masked computations, where $n_C$ is the number of context, and $n_T$ is the number of test points, respectively, and a fast attention variant that further reduces all attention calculations to $\mathcal{O}(n_C)$ in space and time complexity. In benchmarks spanning such tasks as meta-regression, Bayesian optimization, and image completion, we demonstrate that the full variant matches the performance of state-of-the-art methods while training faster and scaling two orders of magnitude higher in number of test points, and the fast variant nearly matches that performance while scaling to millions of both test and context points on consumer hardware. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12502v1-abstract-full').style.display = 'none'; document.getElementById('2411.12502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12201">arXiv:2411.12201</a> <span> [<a href="https://arxiv.org/pdf/2411.12201">pdf</a>, <a href="https://arxiv.org/format/2411.12201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Invariant Shape Representation Learning For Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hossain%2C+T">Tonmoy Hossain</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miaomiao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12201v1-abstract-short" style="display: inline;"> Geometric shape features have been widely used as strong predictors for image classification. Nevertheless, most existing classifiers such as deep neural networks (DNNs) directly leverage the statistical correlations between these shape features and target variables. However, these correlations can often be spurious and unstable across different environments (e.g., in different age groups, certain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12201v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12201v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12201v1-abstract-full" style="display: none;"> Geometric shape features have been widely used as strong predictors for image classification. 
Nevertheless, most existing classifiers such as deep neural networks (DNNs) directly leverage the statistical correlations between these shape features and target variables. However, these correlations can often be spurious and unstable across different environments (e.g., in different age groups, certain types of brain changes have unstable relations with neurodegenerative disease); hence leading to biased or inaccurate predictions. In this paper, we introduce a novel framework that for the first time develops invariant shape representation learning (ISRL) to further strengthen the robustness of image classifiers. In contrast to existing approaches that mainly derive features in the image space, our model ISRL is designed to jointly capture invariant features in latent shape spaces parameterized by deformable transformations. To achieve this goal, we develop a new learning paradigm based on invariant risk minimization (IRM) to learn invariant representations of image and shape features across multiple training distributions/environments. By embedding the features that are invariant with regard to target variables in different environments, our model consistently offers more accurate predictions. We validate our method by performing classification tasks on both simulated 2D images, real 3D brain and cine cardiovascular magnetic resonance images (MRIs). Our code is publicly available at https://github.com/tonmoy-hossain/ISRL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12201v1-abstract-full').style.display = 'none'; document.getElementById('2411.12201v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11845">arXiv:2411.11845</a> <span> [<a href="https://arxiv.org/pdf/2411.11845">pdf</a>, <a href="https://arxiv.org/format/2411.11845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> UniHands: Unifying Various Wild-Collected Keypoints for Personalized Hand Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menghe Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Joonyeoup Kim</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yangwen Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuangquan Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+K">Kee-Bong Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11845v1-abstract-short" style="display: inline;"> Accurate hand motion capture and standardized 3D representation are essential for various hand-related tasks. Collecting keypoints-only data, while efficient and cost-effective, results in low-fidelity representations and lacks surface information. Furthermore, data inconsistencies across sources challenge their integration and use. 
We present UniHands, a novel method for creating standardized yet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11845v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11845v1-abstract-full" style="display: none;"> Accurate hand motion capture and standardized 3D representation are essential for various hand-related tasks. Collecting keypoints-only data, while efficient and cost-effective, results in low-fidelity representations and lacks surface information. Furthermore, data inconsistencies across sources challenge their integration and use. We present UniHands, a novel method for creating standardized yet personalized hand models from wild-collected keypoints from diverse sources. Unlike existing neural implicit representation methods, UniHands uses the widely-adopted parametric models MANO and NIMBLE, providing a more scalable and versatile solution. It also derives unified hand joints from the meshes, which facilitates seamless integration into various hand-related tasks. Experiments on the FreiHAND and InterHand2.6M datasets demonstrate its ability to precisely reconstruct hand mesh vertices and keypoints, effectively capturing high-degree articulation motions. Empirical studies involving nine participants show a clear preference for our unified joints over existing configurations for accuracy and naturalism (p-value 0.016). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11845v1-abstract-full').style.display = 'none'; document.getElementById('2411.11845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11525">arXiv:2411.11525</a> <span> [<a href="https://arxiv.org/pdf/2411.11525">pdf</a>, <a href="https://arxiv.org/format/2411.11525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reliable Poisoned Sample Detection against Backdoor Attacks Enhanced by Sharpness Aware Minimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingda Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mingli Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11525v1-abstract-short" style="display: inline;"> Backdoor attack has been considered as a serious security threat to deep neural networks (DNNs). Poisoned sample detection (PSD) that aims at filtering out poisoned samples from an untrustworthy training dataset has shown very promising performance for defending against data poisoning based backdoor attacks. 
However, we observe that the detection performance of many advanced methods is likely to b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11525v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11525v1-abstract-full" style="display: none;"> Backdoor attack has been considered as a serious security threat to deep neural networks (DNNs). Poisoned sample detection (PSD) that aims at filtering out poisoned samples from an untrustworthy training dataset has shown very promising performance for defending against data poisoning based backdoor attacks. However, we observe that the detection performance of many advanced methods is likely to be unstable when facing weak backdoor attacks, such as low poisoning ratio or weak trigger strength. To further verify this observation, we make a statistical investigation among various backdoor attacks and poisoned sample detections, showing a positive correlation between backdoor effect and detection performance. It inspires us to strengthen the backdoor effect to enhance detection performance. Since we cannot achieve that goal via directly manipulating poisoning ratio or trigger strength, we propose to train one model using the Sharpness-Aware Minimization (SAM) algorithm, rather than the vanilla training algorithm. We also provide both empirical and theoretical analysis about how SAM training strengthens the backdoor effect. Then, this SAM trained model can be seamlessly integrated with any off-the-shelf PSD method that extracts discriminative features from the trained model for detection, called SAM-enhanced PSD. 
Extensive experiments on several benchmark datasets show the reliable detection performance of the proposed method against both weak and strong backdoor attacks, with significant improvements against various attacks ($+34.38\%$ TPR on average), over the conventional PSD methods (i.e., without SAM enhancement). Overall, this work provides new insights about PSD and proposes a novel approach that can complement existing detection methods, which may inspire more in-depth explorations in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11525v1-abstract-full').style.display = 'none'; document.getElementById('2411.11525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10329">arXiv:2411.10329</a> <span> [<a href="https://arxiv.org/pdf/2411.10329">pdf</a>, <a href="https://arxiv.org/format/2411.10329">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Safe Text-to-Image Generation: Simply Sanitize the Prompt Embedding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+H">Huming Qiu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanxu Chen</a>, <a 
href="/search/cs?searchtype=author&query=Zhang%2C+M">Mi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10329v1-abstract-short" style="display: inline;"> In recent years, text-to-image (T2I) generation models have made significant progress in generating high-quality images that align with text descriptions. However, these models also face the risk of unsafe generation, potentially producing harmful content that violates usage policies, such as explicit material. Existing safe generation methods typically focus on suppressing inappropriate content b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10329v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10329v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10329v1-abstract-full" style="display: none;"> In recent years, text-to-image (T2I) generation models have made significant progress in generating high-quality images that align with text descriptions. However, these models also face the risk of unsafe generation, potentially producing harmful content that violates usage policies, such as explicit material. Existing safe generation methods typically focus on suppressing inappropriate content by erasing undesired concepts from visual representations, while neglecting to sanitize the textual representation. Although these methods help mitigate the risk of misuse to certain extent, their robustness remains insufficient when dealing with adversarial attacks. 
Given that semantic consistency between input text and output image is a fundamental requirement for T2I models, we identify that textual representations (i.e., prompt embeddings) are likely the primary source of unsafe generation. To this end, we propose a vision-agnostic safe generation framework, Embedding Sanitizer (ES), which focuses on erasing inappropriate concepts from prompt embeddings and uses the sanitized embeddings to guide the model for safe generation. ES is applied to the output of the text encoder as a plug-and-play module, enabling seamless integration with different T2I models as well as other safeguards. In addition, ES's unique scoring mechanism assigns a score to each token in the prompt to indicate its potential harmfulness, and dynamically adjusts the sanitization intensity to balance defensive performance and generation quality. Through extensive evaluation on five prompt benchmarks, our approach achieves state-of-the-art robustness by sanitizing the source (prompt embedding) of unsafe generation compared to nine baseline methods. It significantly outperforms existing safeguards in terms of interpretability and controllability while maintaining generation quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10329v1-abstract-full').style.display = 'none'; document.getElementById('2411.10329v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09492">arXiv:2411.09492</a> <span> [<a href="https://arxiv.org/pdf/2411.09492">pdf</a>, <a href="https://arxiv.org/format/2411.09492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruihui Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+B">Bo Xia</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiaobing Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09492v1-abstract-short" style="display: inline;"> Large language models (LLMs) excel in high-resource languages but face notable challenges in low-resource languages like Mongolian. This paper addresses these challenges by categorizing capabilities into language abilities (syntax and semantics) and cognitive abilities (knowledge and reasoning). 
To systematically evaluate these areas, we developed MM-Eval, a specialized dataset based on Modern Mon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09492v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09492v1-abstract-full" style="display: none;"> Large language models (LLMs) excel in high-resource languages but face notable challenges in low-resource languages like Mongolian. This paper addresses these challenges by categorizing capabilities into language abilities (syntax and semantics) and cognitive abilities (knowledge and reasoning). To systematically evaluate these areas, we developed MM-Eval, a specialized dataset based on Modern Mongolian Language Textbook I and enriched with WebQSP and MGSM datasets. Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat, Llama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models performed better on syntactic tasks than semantic tasks, highlighting a gap in deeper language understanding; and 2) knowledge tasks showed a moderate decline, suggesting that models can transfer general knowledge from high-resource to low-resource contexts. The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge, and 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in low-resource languages like Mongolian. The dataset is available at https://github.com/joenahm/MM-Eval. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09492v1-abstract-full').style.display = 'none'; document.getElementById('2411.09492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09259">arXiv:2411.09259</a> <span> [<a href="https://arxiv.org/pdf/2411.09259">pdf</a>, <a href="https://arxiv.org/format/2411.09259">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Jailbreak Attacks and Defenses against Multimodal Generative Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuannan Liu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+X">Xing Cui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peipei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zekun Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huaibo Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shuhan Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miaoxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yueying Zou</a>, <a href="/search/cs?searchtype=author&query=He%2C+R">Ran He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2411.09259v1-abstract-short" style="display: inline;"> The rapid evolution of multimodal foundation models has led to significant advancements in cross-modal understanding and generation across diverse modalities, including text, images, audio, and video. However, these models remain susceptible to jailbreak attacks, which can bypass built-in safety mechanisms and induce the production of potentially harmful content. Consequently, understanding the me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09259v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09259v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09259v1-abstract-full" style="display: none;"> The rapid evolution of multimodal foundation models has led to significant advancements in cross-modal understanding and generation across diverse modalities, including text, images, audio, and video. However, these models remain susceptible to jailbreak attacks, which can bypass built-in safety mechanisms and induce the production of potentially harmful content. Consequently, understanding the methods of jailbreak attacks and existing defense mechanisms is essential to ensure the safe deployment of multimodal generative models in real-world scenarios, particularly in security-sensitive applications. To provide comprehensive insight into this topic, this survey reviews jailbreak and defense in multimodal generative models. First, given the generalized lifecycle of multimodal jailbreak, we systematically explore attacks and corresponding defense strategies across four levels: input, encoder, generator, and output. Based on this analysis, we present a detailed taxonomy of attack methods, defense mechanisms, and evaluation frameworks specific to multimodal generative models. 
Additionally, we cover a wide range of input-output configurations, including modalities such as Any-to-Text, Any-to-Vision, and Any-to-Any within generative systems. Finally, we highlight current research challenges and propose potential directions for future research. The open-source repository corresponding to this work can be found at https://github.com/liuxuannan/Awesome-Multimodal-Jailbreak. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09259v1-abstract-full').style.display = 'none'; document.getElementById('2411.09259v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ongoing work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09176">arXiv:2411.09176</a> <span> [<a href="https://arxiv.org/pdf/2411.09176">pdf</a>, <a href="https://arxiv.org/format/2411.09176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gazing at Rewards: Eye Movements as a Lens into Human and AI Decision-Making in Hybrid Visual Foraging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+D">Dingwei Tan</a>, <a 
href="/search/cs?searchtype=author&query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhaowei Sun</a>, <a href="/search/cs?searchtype=author&query=Wolfe%2C+J+M">Jeremy M. Wolfe</a>, <a href="/search/cs?searchtype=author&query=Cham%2C+T">Tat-Jen Cham</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengmi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09176v2-abstract-short" style="display: inline;"> Imagine searching a collection of coins for quarters ($0.25$), dimes ($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where observers look for multiple instances of multiple target types. In such tasks, how do target values and their prevalence influence foraging and eye movement behaviors (e.g., should you prioritize rare quarters or common nickels)? To explore this, we con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09176v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09176v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09176v2-abstract-full" style="display: none;"> Imagine searching a collection of coins for quarters ($0.25$), dimes ($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where observers look for multiple instances of multiple target types. In such tasks, how do target values and their prevalence influence foraging and eye movement behaviors (e.g., should you prioritize rare quarters or common nickels)? To explore this, we conducted human psychophysics experiments, revealing that humans are proficient reward foragers. 
Their eye fixations are drawn to regions with higher average rewards, fixation durations are longer on more valuable targets, and their cumulative rewards exceed chance, approaching the upper bound of optimal foragers. To probe these decision-making processes of humans, we developed a transformer-based Visual Forager (VF) model trained via reinforcement learning. Our VF model takes a series of targets, their corresponding values, and the search image as inputs, processes the images using foveated vision, and produces a sequence of eye movements along with decisions on whether to collect each fixated item. Our model outperforms all baselines, achieves cumulative rewards comparable to those of humans, and approximates human foraging behavior in eye movements and foraging biases within time-limited environments. Furthermore, stress tests on out-of-distribution tasks with novel targets, unseen values, and varying set sizes demonstrate the VF model's effective generalization. Our work offers valuable insights into the relationship between eye movements and decision-making, with our model serving as a powerful tool for further exploration of this connection. All data, code, and models will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09176v2-abstract-full').style.display = 'none'; document.getElementById('2411.09176v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08724">arXiv:2411.08724</a> <span> [<a href="https://arxiv.org/pdf/2411.08724">pdf</a>, <a href="https://arxiv.org/format/2411.08724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> QCG-Rerank: Chunks Graph Rerank with Query Expansion in Retrieval-Augmented LLMs for Tourism Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Q">Qikai Wei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mingzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C">Chunlong Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jingfu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feifei Shi</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+H">Huansheng Ning</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08724v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) mitigates the issue of hallucination in Large Language Models (LLMs) by integrating information retrieval techniques. However, in the tourism domain, since the query is usually brief and the content in the database is diverse, existing RAG may contain a significant amount of irrelevant or contradictory information contents after retrieval. 
To address this chall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08724v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08724v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08724v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) mitigates the issue of hallucination in Large Language Models (LLMs) by integrating information retrieval techniques. However, in the tourism domain, since the query is usually brief and the content in the database is diverse, existing RAG may contain a significant amount of irrelevant or contradictory information contents after retrieval. To address this challenge, we propose the QCG-Rerank model. This model first performs an initial retrieval to obtain candidate chunks and then enhances semantics by extracting critical information to expand the original query. Next, we utilize the expanded query and candidate chunks to calculate similarity scores as the initial transition probability and construct the chunks graph. Subsequently, we iteratively compute the transition probabilities based on an initial estimate until convergence. The chunks with the highest score are selected and input into the LLMs to generate responses. We evaluate the model on Cultour, IIRC, StrategyQA, HotpotQA, SQuAD, and MuSiQue datasets. The experimental results demonstrate the effectiveness and superiority of the QCG-Rerank method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08724v1-abstract-full').style.display = 'none'; document.getElementById('2411.08724v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08312">arXiv:2411.08312</a> <span> [<a href="https://arxiv.org/pdf/2411.08312">pdf</a>, <a href="https://arxiv.org/format/2411.08312">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> A Novel Extensible Simulation Framework for CXL-Enabled Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+Y">Yuda An</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+S">Shushu Yi</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+B">Bo Mao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingzhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Ke Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+N">Nong Xiao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guangyu Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolin Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yingwei Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08312v1-abstract-short" style="display: inline;"> Compute Express Link (CXL) serves as a rising industry standard, delivering high-speed cache-coherent links to a variety of devices, including host CPUs, computational accelerators, and memory devices. It is designed to promote system scalability, enable peer-to-peer exchanges, and accelerate data transmissions. 
To achieve these objectives, the most recent CXL protocol has brought forth several in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08312v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08312v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08312v1-abstract-full" style="display: none;"> Compute Express Link (CXL) serves as a rising industry standard, delivering high-speed cache-coherent links to a variety of devices, including host CPUs, computational accelerators, and memory devices. It is designed to promote system scalability, enable peer-to-peer exchanges, and accelerate data transmissions. To achieve these objectives, the most recent CXL protocol has brought forth several innovative features, such as port-focused routing, device-handled coherence, and PCIe 6.0 compatibility. However, due to the limited availability of hardware prototypes and simulators compatible with CXL, earlier CXL research has largely depended on emulating CXL devices using remote NUMA nodes. Unfortunately, these NUMA-based emulators have difficulties in accurately representing the new features due to fundamental differences in hardware and protocols. Moreover, the absence of support for non-tree topology and PCIe links makes it complex to merely adapt existing simulators for CXL simulation. To overcome these problems, we introduce ESF, a simulation framework specifically designed for CXL systems. ESF has been developed to accurately reflect the unique features of the latest CXL protocol from the ground up. It uses a specialized interconnect layer to facilitate connections within a wide range of system topologies and also includes key components to carry out specific functions required by these features. 
By utilizing ESF, we thoroughly investigate various aspects of CXL systems, including system topology, device-handled coherence, and the effects of PCIe characteristics, leading to important findings that can guide the creation of high-performance CXL systems. The ESF source codes are fully open-source and can be accessed at https://anonymous.4open.science/r/ESF-1CE3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08312v1-abstract-full').style.display = 'none'; document.getElementById('2411.08312v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08210">arXiv:2411.08210</a> <span> [<a href="https://arxiv.org/pdf/2411.08210">pdf</a>, <a href="https://arxiv.org/format/2411.08210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> BOSON$^{-1}$: Understanding and Enabling Physically-Robust Photonic Inverse Design with Adaptive Variation-Aware Subspace Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhengqi Gao</a>, <a href="/search/cs?searchtype=author&query=Begovic%2C+A">Amir Begovic</a>, <a 
href="/search/cs?searchtype=author&query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z+R">Zhaoran Rena Huang</a>, <a href="/search/cs?searchtype=author&query=Boning%2C+D">Duane Boning</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08210v1-abstract-short" style="display: inline;"> Nanophotonic device design aims to optimize photonic structures to meet specific requirements across various applications. Inverse design has unlocked non-intuitive, high-dimensional design spaces, enabling the discovery of high-performance devices beyond heuristic or analytic methods. The adjoint method, which calculates gradients for all variables using just two simulations, enables efficient na… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08210v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08210v1-abstract-full" style="display: none;"> Nanophotonic device design aims to optimize photonic structures to meet specific requirements across various applications. Inverse design has unlocked non-intuitive, high-dimensional design spaces, enabling the discovery of high-performance devices beyond heuristic or analytic methods. The adjoint method, which calculates gradients for all variables using just two simulations, enables efficient navigation of this complex space. However, many inverse-designed structures, while numerically plausible, are difficult to fabricate and sensitive to variations, limiting their practical use. 
The discrete nature with numerous local-optimal structures also poses significant optimization challenges, often causing gradient-based methods to converge on suboptimal designs. In this work, we formulate inverse design as a fabrication-restricted, discrete, probabilistic optimization problem and introduce BOSON$^{-1}$, an end-to-end, variation-aware subspace optimization framework to address the challenges of manufacturability, robustness, and optimizability. To overcome optimization difficulty, we propose dense target-enhanced gradient flows to mitigate misleading local optima and introduce a conditional subspace optimization strategy to create high-dimensional tunnels to escape local optima. Furthermore, we significantly reduce the runtime associated with optimizing across exponential variation samples through an adaptive sampling-based robust optimization, ensuring both efficiency and variation robustness. On three representative photonic device benchmarks, our proposed inverse design methodology BOSON$^{-1}$ delivers fabricable structures and achieves the best convergence and performance under realistic variations, outperforming prior arts with 74.3% post-fabrication performance. We open-source our codes at https://github.com/ScopeX-ASU/BOSON. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08210v1-abstract-full').style.display = 'none'; document.getElementById('2411.08210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. 
Accepted IEEE DATE 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06899">arXiv:2411.06899</a> <span> [<a href="https://arxiv.org/pdf/2411.06899">pdf</a>, <a href="https://arxiv.org/format/2411.06899">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LongSafetyBench: Long-Context LLMs Struggle with Safety Issues </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mianqiu Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoran Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shaojun Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mozhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenkun Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linyang Li</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhikai Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linlin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06899v1-abstract-short" style="display: inline;"> With the development of large language models (LLMs), the sequence length of these models continues to increase, drawing significant attention to long-context language models. However, the evaluation of these models has been primarily limited to their capabilities, with a lack of research focusing on their safety. Existing work, such as ManyShotJailbreak, has to some extent demonstrated that long-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06899v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06899v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06899v1-abstract-full" style="display: none;"> With the development of large language models (LLMs), the sequence length of these models continues to increase, drawing significant attention to long-context language models. However, the evaluation of these models has been primarily limited to their capabilities, with a lack of research focusing on their safety. Existing work, such as ManyShotJailbreak, has to some extent demonstrated that long-context language models can exhibit safety concerns. However, the methods used are limited and lack comprehensiveness. In response, we introduce LongSafetyBench, the first benchmark designed to objectively and comprehensively evaluate the safety of long-context models. LongSafetyBench consists of 10 task categories, with an average length of 41,889 words. After testing eight long-context language models on LongSafetyBench, we found that existing models generally exhibit insufficient safety capabilities. The proportion of safe responses from most mainstream long-context LLMs is below 50%. 
Moreover, models' safety performance in long-context scenarios does not always align with that in short-context scenarios. Further investigation revealed that long-context models tend to overlook harmful content within lengthy texts. We also proposed a simple yet effective solution, allowing open-source models to achieve performance comparable to that of top-tier closed-source models. We believe that LongSafetyBench can serve as a valuable benchmark for evaluating the safety capabilities of long-context language models. We hope that our work will encourage the broader community to pay attention to the safety of long-context models and contribute to the development of solutions to improve the safety of long-context LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06899v1-abstract-full').style.display = 'none'; document.getElementById('2411.06899v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06112">arXiv:2411.06112</a> <span> [<a href="https://arxiv.org/pdf/2411.06112">pdf</a>, <a href="https://arxiv.org/format/2411.06112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Interpret the Internal States of Recommendation Model with Sparse Autoencoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06112v1-abstract-short" style="display: inline;"> Explainable recommendation systems are important to enhance transparency, accuracy, and fairness. Beyond result-level explanations, model-level interpretations can provide valuable insights that allow developers to optimize system designs and implement targeted improvements. However, most current approaches depend on specialized model designs, which often lack generalization capabilities. Given th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06112v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06112v1-abstract-full" style="display: none;"> Explainable recommendation systems are important to enhance transparency, accuracy, and fairness. 
Beyond result-level explanations, model-level interpretations can provide valuable insights that allow developers to optimize system designs and implement targeted improvements. However, most current approaches depend on specialized model designs, which often lack generalization capabilities. Given the various kinds of recommendation models, existing methods have limited ability to effectively interpret them. To address this issue, we propose RecSAE, an automatic, generalizable probing method for interpreting the internal states of Recommendation models with Sparse AutoEncoder. RecSAE serves as a plug-in module that does not affect original models during interpretations, while also enabling predictable modifications to their behaviors based on interpretation results. Firstly, we train an autoencoder with sparsity constraints to reconstruct internal activations of recommendation models, making the RecSAE latents more interpretable and monosemantic than the original neuron activations. Secondly, we automated the construction of concept dictionaries based on the relationship between latent activations and input item sequences. Thirdly, RecSAE validates these interpretations by predicting latent activations on new item sequences using the concept dictionary and deriving interpretation confidence scores from precision and recall. We demonstrate RecSAE's effectiveness on two datasets, identifying hundreds of highly interpretable concepts from pure ID-based models. Latent ablation studies further confirm that manipulating latent concepts produces corresponding changes in model output behavior, underscoring RecSAE's utility for both understanding and targeted tuning recommendation models. Code and data are publicly available at https://github.com/Alice1998/RecSAE. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06112v1-abstract-full').style.display = 'none'; document.getElementById('2411.06112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06101">arXiv:2411.06101</a> <span> [<a href="https://arxiv.org/pdf/2411.06101">pdf</a>, <a href="https://arxiv.org/format/2411.06101">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Detecting Reference Errors in Scientific Literature with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T+M">Tianmai M. Zhang</a>, <a href="/search/cs?searchtype=author&query=Abernethy%2C+N+F">Neil F. Abernethy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06101v1-abstract-short" style="display: inline;"> Reference errors, such as citation and quotation errors, are common in scientific papers. Such errors can result in the propagation of inaccurate information, but are difficult and time-consuming to detect, posing a significant challenge to scientific publishing. 
To support automatic detection of reference errors, this work evaluated the ability of large language models in OpenAI's GPT family to d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06101v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06101v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06101v1-abstract-full" style="display: none;"> Reference errors, such as citation and quotation errors, are common in scientific papers. Such errors can result in the propagation of inaccurate information, but are difficult and time-consuming to detect, posing a significant challenge to scientific publishing. To support automatic detection of reference errors, this work evaluated the ability of large language models in OpenAI's GPT family to detect quotation errors. Specifically, we prepared an expert-annotated, general-domain dataset of statement-reference pairs from journal articles. Large language models were evaluated in different settings with varying amounts of reference information provided by retrieval augmentation. Our results showed that large language models are able to detect erroneous citations with limited context and without fine-tuning. This study contributes to the growing literature that seeks to utilize artificial intelligence to assist in the writing, reviewing, and publishing of scientific papers. Potential avenues for further improvements in this task are also discussed. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06101v1-abstract-full').style.display = 'none'; document.getElementById('2411.06101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06004">arXiv:2411.06004</a> <span> [<a href="https://arxiv.org/pdf/2411.06004">pdf</a>, <a href="https://arxiv.org/format/2411.06004">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Do Data Center Network Metrics Predict Application-Facing Performance? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+B">Brian Chang</a>, <a href="/search/cs?searchtype=author&query=Mogul%2C+J+C">Jeffrey C. Mogul</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Akella%2C+A">Aditya Akella</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06004v1-abstract-short" style="display: inline;"> Applications that run in large-scale data center networks (DCNs) rely on the DCN's ability to deliver application requests in a performant manner. 
DCNs expose a complex design and operational space, and network designers and operators care how different options along this space affect application performance. One might run controlled experiments and measure the corresponding application-facing per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06004v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06004v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06004v1-abstract-full" style="display: none;"> Applications that run in large-scale data center networks (DCNs) rely on the DCN's ability to deliver application requests in a performant manner. DCNs expose a complex design and operational space, and network designers and operators care how different options along this space affect application performance. One might run controlled experiments and measure the corresponding application-facing performance, but such experiments become progressively infeasible at a large scale, and simulations risk yielding inaccurate or incomplete results. Instead, we show that we can predict application-facing performance through more easily measured network metrics. For example, network telemetry metrics (e.g., link utilization) can predict application-facing metrics (e.g., transfer latency). Through large-scale measurements of production networks, we study the correlation between the two types of metrics, and construct predictive, interpretable models that serve as a suggestive guideline to network designers and operators. We show that no single network metric is universally the best predictor (even though some prior work has focused on a single predictor). We found that simple linear models often have the lowest error, while queueing-based models are better in a few cases. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06004v1-abstract-full').style.display = 'none'; document.getElementById('2411.06004v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 (main body) + 5 (appendix) pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05902">arXiv:2411.05902</a> <span> [<a href="https://arxiv.org/pdf/2411.05902">pdf</a>, <a href="https://arxiv.org/format/2411.05902">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Models in Vision: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jing Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Gongye Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lun Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chengyue Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Taiqiang Wu</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zhongwei 
Wan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shen Yan</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Huaxiu Yao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongxia Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sapiro%2C+G">Guillermo Sapiro</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+N">Ngai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05902v1-abstract-short" style="display: inline;"> Autoregressive modeling has been a huge success in the field of natural language processing (NLP). Recently, autoregressive models have emerged as a significant area of focus in computer vision, where they excel in producing high-quality visual content. Autoregressive models in NLP typically operate on subword tokens. However, the representation strategy in computer vision can vary in different le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05902v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05902v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05902v1-abstract-full" style="display: none;"> Autoregressive modeling has been a huge success in the field of natural language processing (NLP). 
Recently, autoregressive models have emerged as a significant area of focus in computer vision, where they excel in producing high-quality visual content. Autoregressive models in NLP typically operate on subword tokens. However, the representation strategy in computer vision can vary in different levels, i.e., pixel-level, token-level, or scale-level, reflecting the diverse and hierarchical nature of visual data compared to the sequential structure of language. This survey comprehensively examines the literature on autoregressive models applied to vision. To improve readability for researchers from diverse research backgrounds, we start with preliminary sequence representation and modeling in vision. Next, we divide the fundamental frameworks of visual autoregressive models into three general sub-categories, including pixel-based, token-based, and scale-based models based on the strategy of representation. We then explore the interconnections between autoregressive models and other generative models. Furthermore, we present a multi-faceted categorization of autoregressive models in computer vision, including image generation, video generation, 3D generation, and multi-modal generation. We also elaborate on their applications in diverse domains, including emerging domains such as embodied AI and 3D medical AI, with about 250 related references. Finally, we highlight the current challenges to autoregressive models in vision with suggestions about potential research directions. We have also set up a GitHub repository to organize the papers included in this survey at: https://github.com/ChaofanTao/Autoregressive-Models-in-Vision-Survey. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05902v1-abstract-full').style.display = 'none'; document.getElementById('2411.05902v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05833">arXiv:2411.05833</a> <span> [<a href="https://arxiv.org/pdf/2411.05833">pdf</a>, <a href="https://arxiv.org/format/2411.05833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Fully Automated Correlated Time Series Forecasting in Minutes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xinle Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xingjian Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dalin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chenjuan Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bin Yang</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+C+S">Christian S. 
Jensen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05833v1-abstract-short" style="display: inline;"> Societal and industrial infrastructures and systems increasingly leverage sensors that emit correlated time series. Forecasting of future values of such time series based on recorded historical values has important benefits. Automatically designed models achieve higher accuracy than manually designed models. Given a forecasting task, which includes a dataset and a forecasting horizon, automated de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05833v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05833v1-abstract-full" style="display: none;"> Societal and industrial infrastructures and systems increasingly leverage sensors that emit correlated time series. Forecasting of future values of such time series based on recorded historical values has important benefits. Automatically designed models achieve higher accuracy than manually designed models. Given a forecasting task, which includes a dataset and a forecasting horizon, automated design methods automatically search for an optimal forecasting model for the task in a manually designed search space, and then train the identified model using the dataset to enable the forecasting. Existing automated methods face three challenges. First, the search space is constructed by human experts, rendering the methods only semi-automated and yielding search spaces prone to subjective biases. Second, it is time consuming to search for an optimal model. Third, training the identified model for a new task is also costly. 
These challenges limit the practicability of automated methods in real-world settings. To contend with the challenges, we propose a fully automated and highly efficient correlated time series forecasting framework where the search and training can be done in minutes. The framework includes a data-driven, iterative strategy to automatically prune a large search space to obtain a high-quality search space for a new forecasting task. It includes a zero-shot search strategy to efficiently identify the optimal model in the customized search space. And it includes a fast parameter adaptation strategy to accelerate the training of the identified model. Experiments on seven benchmark datasets offer evidence that the framework is capable of state-of-the-art accuracy and is much more efficient than existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05833v1-abstract-full').style.display = 'none'; document.getElementById('2411.05833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by PVLDB 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04138">arXiv:2411.04138</a> <span> [<a href="https://arxiv.org/pdf/2411.04138">pdf</a>, <a href="https://arxiv.org/format/2411.04138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NetworkGym: Reinforcement Learning Environments for Multi-Access Traffic Management in Network Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haider%2C+M">Momin Haider</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menglei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Arpit Gupta</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jing Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04138v1-abstract-short" style="display: inline;"> Mobile devices such as smartphones, laptops, and tablets can often connect to multiple access networks (e.g., Wi-Fi, LTE, and 5G) simultaneously. 
Recent advancements facilitate seamless integration of these connections below the transport layer, enhancing the experience for apps that lack inherent multi-path support. This optimization hinges on dynamically determining the traffic distribution acro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04138v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04138v1-abstract-full" style="display: none;"> Mobile devices such as smartphones, laptops, and tablets can often connect to multiple access networks (e.g., Wi-Fi, LTE, and 5G) simultaneously. Recent advancements facilitate seamless integration of these connections below the transport layer, enhancing the experience for apps that lack inherent multi-path support. This optimization hinges on dynamically determining the traffic distribution across networks for each device, a process referred to as <i>multi-access traffic splitting</i>. This paper introduces <i>NetworkGym</i>, a high-fidelity network environment simulator that facilitates generating multiple network traffic flows and multi-access traffic splitting. This simulator facilitates training and evaluating different RL-based solutions for the multi-access traffic splitting problem. Our initial explorations demonstrate that the majority of existing state-of-the-art offline RL algorithms (e.g. CQL) fail to outperform certain hand-crafted heuristic policies on average. This illustrates the urgent need to evaluate offline RL algorithms against a broader range of benchmarks, rather than relying solely on popular ones such as D4RL. We also propose an extension to the TD3+BC algorithm, named Pessimistic TD3 (PTD3), and demonstrate that it outperforms many state-of-the-art offline RL algorithms. 
PTD3's behavioral constraint mechanism, which relies on value-function pessimism, is theoretically motivated and relatively simple to implement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04138v1-abstract-full').style.display = 'none'; document.getElementById('2411.04138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS (Datasets and Benchmarks)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03845">arXiv:2411.03845</a> <span> [<a href="https://arxiv.org/pdf/2411.03845">pdf</a>, <a href="https://arxiv.org/format/2411.03845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reconsidering the Performance of GAE in Link Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weishuo Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanbo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Muhan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03845v1-abstract-short" 
style="display: inline;"> Various graph neural networks (GNNs) with advanced training techniques and model designs have been proposed for link prediction tasks. However, outdated baseline models may lead to an overestimation of the benefits provided by these novel approaches. To address this, we systematically investigate the potential of Graph Autoencoders (GAE) by meticulously tuning hyperparameters and utilizing the tri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03845v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03845v1-abstract-full" style="display: none;"> Various graph neural networks (GNNs) with advanced training techniques and model designs have been proposed for link prediction tasks. However, outdated baseline models may lead to an overestimation of the benefits provided by these novel approaches. To address this, we systematically investigate the potential of Graph Autoencoders (GAE) by meticulously tuning hyperparameters and utilizing the trick of orthogonal embedding and linear propagation. Our findings reveal that a well-optimized GAE can match the performance of more complex models while offering greater computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03845v1-abstract-full').style.display = 'none'; document.getElementById('2411.03845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03766">arXiv:2411.03766</a> <span> [<a href="https://arxiv.org/pdf/2411.03766">pdf</a>, <a href="https://arxiv.org/format/2411.03766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Number Cookbook: Number Understanding of Language Models and How to Improve It </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haotong Yang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yi Hu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+S">Shijia Kang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhouchen Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Muhan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03766v1-abstract-short" style="display: inline;"> Large language models (LLMs) can solve an increasing number of complex reasoning tasks while making surprising mistakes in basic numerical understanding and processing (such as 9.11 > 9.9). 
The latter ability is essential for tackling complex arithmetic and mathematical problems and serves as a foundation for most reasoning tasks, but previous work paid little attention to it or only discussed sev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03766v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03766v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03766v1-abstract-full" style="display: none;"> Large language models (LLMs) can solve an increasing number of complex reasoning tasks while making surprising mistakes in basic numerical understanding and processing (such as 9.11 > 9.9). The latter ability is essential for tackling complex arithmetic and mathematical problems and serves as a foundation for most reasoning tasks, but previous work paid little attention to it or only discussed several restricted tasks (like integer addition). In this paper, we comprehensively investigate the numerical understanding and processing ability (NUPA) of LLMs. Firstly, we introduce a benchmark covering four common numerical representations and 17 distinct numerical tasks in four major categories, resulting in 41 meaningful combinations in total. These tasks are derived from primary and secondary education curricula, encompassing nearly all everyday numerical understanding and processing scenarios, and the rules of these tasks are very simple and clear. Through the benchmark, we find that current LLMs fail frequently in many of the tasks. To study the problem, we train small models with existing and potential techniques for enhancing NUPA (such as special tokenizers, PEs, and number formats), comprehensively evaluating their effectiveness using our testbed. 
We also finetune practical-scale LLMs on our proposed NUPA tasks and find that 1) naive finetuning can improve NUPA a lot on many but not all tasks, and 2) surprisingly, techniques designed to enhance NUPA prove ineffective for finetuning pretrained models. We further explore the impact of chain-of-thought techniques on NUPA. Our work takes a preliminary step towards understanding and improving NUPA of LLMs. Our benchmark and code are released at https://github.com/GraphPKU/number_cookbook. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03766v1-abstract-full').style.display = 'none'; document.getElementById('2411.03766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03365">arXiv:2411.03365</a> <span> [<a href="https://arxiv.org/pdf/2411.03365">pdf</a>, <a href="https://arxiv.org/format/2411.03365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Real-Time Threat Detection in 5G Networks: A Self-Attention RNN Autoencoder Approach for Spectral Intrusion Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&query=Kouchaki%2C+M">Mohammadreza Kouchaki</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minglong Zhang</a>, <a href="/search/cs?searchtype=author&query=Abdalla%2C+A+S">Aly S. Abdalla</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+G">Guangchen Lan</a>, <a href="/search/cs?searchtype=author&query=Brinton%2C+C+G">Christopher G. Brinton</a>, <a href="/search/cs?searchtype=author&query=Marojevic%2C+V">Vuk Marojevic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03365v1-abstract-short" style="display: inline;"> In the rapidly evolving landscape of 5G technology, safeguarding Radio Frequency (RF) environments against sophisticated intrusions is paramount, especially in dynamic spectrum access and management. This paper presents an enhanced experimental model that integrates a self-attention mechanism with a Recurrent Neural Network (RNN)-based autoencoder for the detection of anomalous spectral activities… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03365v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03365v1-abstract-full" style="display: none;"> In the rapidly evolving landscape of 5G technology, safeguarding Radio Frequency (RF) environments against sophisticated intrusions is paramount, especially in dynamic spectrum access and management. This paper presents an enhanced experimental model that integrates a self-attention mechanism with a Recurrent Neural Network (RNN)-based autoencoder for the detection of anomalous spectral activities in 5G networks at the waveform level. 
Our approach, grounded in time-series analysis, processes in-phase and quadrature (I/Q) samples to identify irregularities that could indicate potential jamming attacks. The model's architecture, augmented with a self-attention layer, extends the capabilities of RNN autoencoders, enabling a more nuanced understanding of temporal dependencies and contextual relationships within the RF spectrum. Utilizing a simulated 5G Radio Access Network (RAN) test-bed constructed with srsRAN 5G and Software Defined Radios (SDRs), we generated a comprehensive stream of data that reflects real-world RF spectrum conditions and attack scenarios. The model is trained to reconstruct standard signal behavior, establishing a normative baseline against which deviations, indicative of security threats, are identified. The proposed architecture is designed to balance between detection precision and computational efficiency, so the LSTM network, enriched with self-attention, continues to optimize for minimal execution latency and power consumption. Conducted on a real-world SDR-based testbed, our results demonstrate the model's improved performance and accuracy in threat detection. Keywords: self-attention, real-time intrusion detection, RNN autoencoder, Transformer architecture, LSTM, time series anomaly detection, 5G Security, spectrum access security. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03365v1-abstract-full').style.display = 'none'; document.getElementById('2411.03365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article has been accepted for publication in WiOpt 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03028">arXiv:2411.03028</a> <span> [<a href="https://arxiv.org/pdf/2411.03028">pdf</a>, <a href="https://arxiv.org/format/2411.03028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Graph Agnostic Causal Bayesian Optimisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mukherjee%2C+S">Sumantrak Mukherjee</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Flaxman%2C+S">Seth Flaxman</a>, <a href="/search/cs?searchtype=author&query=Vollmer%2C+S+J">Sebastian Josef Vollmer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03028v1-abstract-short" style="display: inline;"> We study the problem of globally optimising a target variable of an unknown causal graph on which a sequence of soft or hard interventions can be performed. The problem of optimising the target variable associated with a causal graph is formalised as Causal Bayesian Optimisation (CBO). 
We study the CBO problem under the cumulative regret objective with unknown causal graphs for two settings, namel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03028v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03028v1-abstract-full" style="display: none;"> We study the problem of globally optimising a target variable of an unknown causal graph on which a sequence of soft or hard interventions can be performed. The problem of optimising the target variable associated with a causal graph is formalised as Causal Bayesian Optimisation (CBO). We study the CBO problem under the cumulative regret objective with unknown causal graphs for two settings, namely structural causal models with hard interventions and function networks with soft interventions. We propose Graph Agnostic Causal Bayesian Optimisation (GACBO), an algorithm that actively discovers the causal structure that contributes to achieving optimal rewards. GACBO seeks to balance exploiting the actions that give the best rewards against exploring the causal structures and functions. To the best of our knowledge, our work is the first to study causal Bayesian optimization with cumulative regret objectives in scenarios where the graph is unknown or partially known. We show our proposed algorithm outperforms baselines in simulated experiments and real-world applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03028v1-abstract-full').style.display = 'none'; document.getElementById('2411.03028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02785">arXiv:2411.02785</a> <span> [<a href="https://arxiv.org/pdf/2411.02785">pdf</a>, <a href="https://arxiv.org/format/2411.02785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Monkeys at Play: Random Augmentations Cheaply Break LLM Safety Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vega%2C+J">Jason Vega</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junsheng Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gaokai Zhang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+H">Hangoo Kang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+G">Gagandeep Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02785v1-abstract-short" style="display: inline;"> Safety alignment of Large Language Models (LLMs) has recently become a critical objective of model developers. 
In response, a growing body of work has been investigating how safety alignment can be bypassed through various jailbreaking methods, such as adversarial attacks. However, these jailbreak methods can be rather costly or involve a non-trivial amount of creativity and effort, introducing th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02785v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02785v1-abstract-full" style="display: none;"> Safety alignment of Large Language Models (LLMs) has recently become a critical objective of model developers. In response, a growing body of work has been investigating how safety alignment can be bypassed through various jailbreaking methods, such as adversarial attacks. However, these jailbreak methods can be rather costly or involve a non-trivial amount of creativity and effort, introducing the assumption that malicious users are high-resource or sophisticated. In this paper, we study how simple random augmentations to the input prompt affect safety alignment effectiveness in state-of-the-art LLMs, such as Llama 3 and Qwen 2. We perform an in-depth evaluation of 17 different models and investigate the intersection of safety under random augmentations with multiple dimensions: augmentation type, model size, quantization, fine-tuning-based defenses, and decoding strategies (e.g., sampling temperature). We show that low-resource and unsophisticated attackers, i.e. $\textit{stochastic monkeys}$, can significantly improve their chances of bypassing alignment with just 25 random augmentations per prompt. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02785v1-abstract-full').style.display = 'none'; document.getElementById('2411.02785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under peer review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01169">arXiv:2411.01169</a> <span> [<a href="https://arxiv.org/pdf/2411.01169">pdf</a>, <a href="https://arxiv.org/format/2411.01169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TKDE.2024.3397683">10.1109/TKDE.2024.3397683 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bi-Level Graph Structure Learning for Next POI Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a 
href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanqiao Zhu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xiang Tao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengdi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01169v1-abstract-short" style="display: inline;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01169v1-abstract-full" style="display: none;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures based on pre-defined heuristics, failing to consider inherent hierarchical structures of POI features such as geographical locations and visiting peaks, or suffering from noisy and incomplete structures in graphs. 
To address the aforementioned issues, this paper presents a novel Bi-level Graph Structure Learning (BiGSL) for next POI recommendation. BiGSL first learns a hierarchical graph structure to capture the fine-to-coarse connectivity between POIs and prototypes, and then uses a pairwise learning module to dynamically infer relationships between POI pairs and prototype pairs. Based on the learned bi-level graphs, our model then employs a multi-relational graph network that considers both POI- and prototype-level neighbors, resulting in improved POI representations. Our bi-level structure learning scheme is more robust to data noise and incompleteness, and improves the exploration ability for recommendation by alleviating sparsity issues. Experimental results on three real-world datasets demonstrate the superiority of our model over existing state-of-the-art methods, with a significant improvement in recommendation accuracy and exploration performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'none'; document.getElementById('2411.01169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Knowledge and Data Engineering</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Knowledge and Data Engineering, vol. 36, no. 11, pp. 5695-5708, Nov. 
2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00969">arXiv:2411.00969</a> <span> [<a href="https://arxiv.org/pdf/2411.00969">pdf</a>, <a href="https://arxiv.org/format/2411.00969">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Magnitude Pruning of Large Pretrained Transformer Models with a Mixture Gaussian Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yan Sun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+F">Faming Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00969v1-abstract-short" style="display: inline;"> Large pretrained transformer models have revolutionized modern AI applications with their state-of-the-art performance in natural language processing (NLP). However, their substantial parameter count poses challenges for real-world deployment. To address this, researchers often reduce model size by pruning parameters based on their magnitude or sensitivity. 
Previous research has demonstrated the l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00969v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00969v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00969v1-abstract-full" style="display: none;"> Large pretrained transformer models have revolutionized modern AI applications with their state-of-the-art performance in natural language processing (NLP). However, their substantial parameter count poses challenges for real-world deployment. To address this, researchers often reduce model size by pruning parameters based on their magnitude or sensitivity. Previous research has demonstrated the limitations of magnitude pruning, especially in the context of transfer learning for modern NLP tasks. In this paper, we introduce a new magnitude-based pruning algorithm called mixture Gaussian prior pruning (MGPP), which employs a mixture Gaussian prior for regularization. MGPP prunes non-expressive weights under the guidance of the mixture Gaussian prior, aiming to retain the model's expressive capability. Extensive evaluations across various NLP tasks, including natural language understanding, question answering, and natural language generation, demonstrate the superiority of MGPP over existing pruning methods, particularly in high sparsity settings. Additionally, we provide a theoretical justification for the consistency of the sparse transformer, shedding light on the effectiveness of the proposed pruning method. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00969v1-abstract-full').style.display = 'none'; document.getElementById('2411.00969v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00585">arXiv:2411.00585</a> <span> [<a href="https://arxiv.org/pdf/2411.00585">pdf</a>, <a href="https://arxiv.org/format/2411.00585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Bias in Large Language Models during Role-Playing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyue Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenpeng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J+M">Jie M. 
Zhang</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+Y">Yiling Lou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianlin Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Weisong Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuanzhe Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00585v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have become foundational in modern language-driven applications, profoundly influencing daily life. A critical technique in leveraging their potential is role-playing, where LLMs simulate diverse roles to enhance their real-world utility. However, while research has highlighted the presence of social biases in LLM outputs, it remains unclear whether and to what extent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00585v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00585v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have become foundational in modern language-driven applications, profoundly influencing daily life. A critical technique in leveraging their potential is role-playing, where LLMs simulate diverse roles to enhance their real-world utility. However, while research has highlighted the presence of social biases in LLM outputs, it remains unclear whether and to what extent these biases emerge during role-playing scenarios. In this paper, we introduce BiasLens, a fairness testing framework designed to systematically expose biases in LLMs during role-playing. 
Our approach uses LLMs to generate 550 social roles across a comprehensive set of 11 demographic attributes, producing 33,000 role-specific questions targeting various forms of bias. These questions, spanning Yes/No, multiple-choice, and open-ended formats, are designed to prompt LLMs to adopt specific roles and respond accordingly. We employ a combination of rule-based and LLM-based strategies to identify biased responses, rigorously validated through human evaluation. Using the generated questions as the benchmark, we conduct extensive evaluations of six advanced LLMs released by OpenAI, Mistral AI, Meta, Alibaba, and DeepSeek. Our benchmark reveals 72,716 biased responses across the studied LLMs, with individual models yielding between 7,754 and 16,963 biased responses, underscoring the prevalence of bias in role-playing contexts. To support future research, we have publicly released the benchmark, along with all scripts and experimental results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00585v1-abstract-full').style.display = 'none'; document.getElementById('2411.00585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00473">arXiv:2411.00473</a> <span> [<a href="https://arxiv.org/pdf/2411.00473">pdf</a>, <a href="https://arxiv.org/format/2411.00473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Synergistic Interplay of Large Language Model and Digital Twin for Autonomous Optical Networks: Field Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuchen Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Anni Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yan Shi</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shikui Shen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiongyan Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Danshi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00473v1-abstract-short" style="display: inline;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. 
However, LLM is primarily developed for natural language processing tasks, rendering it less effective in predicting the ph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00473v1-abstract-full" style="display: none;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. However, LLM is primarily developed for natural language processing tasks, rendering it less effective in predicting the physical dynamics of optical communications. Moreover, optical networks demand rigorous stability, where direct deployment of strategies generated from LLM poses safety concerns. In this paper, a digital twin (DT)-enhanced LLM scheme is proposed to facilitate autonomous optical networks. By leveraging monitoring data and advanced models, the DT of optical networks can accurately characterize their physical dynamics, furnishing LLMs with dynamic-updated information for reliable decision-making. Prior to deployment, the generated strategies from LLM can be pre-verified in the DT platform, which also provides feedback to the LLM for further refinement of strategies. The synergistic interplay between DT and LLM for autonomous optical networks is demonstrated through three scenarios: performance optimization under dynamic loadings in an experimental C+L-band long-haul transmission link, protection switching for device upgrading in a field-deployed six-node mesh network, and performance recovery after fiber cuts in a field-deployed C+L-band transmission link. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'none'; document.getElementById('2411.00473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures; Accepted by IEEE Communications Magazine, Open call</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00331">arXiv:2411.00331</a> <span> [<a href="https://arxiv.org/pdf/2411.00331">pdf</a>, <a href="https://arxiv.org/format/2411.00331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Beyond Utility: Evaluating LLM as Recommender </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chumeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayin Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&query=Clarke%2C+C+L+A">Charles L. A. 
Clarke</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chuhan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00331v1-abstract-short" style="display: inline;"> With the rapid development of Large Language Models (LLMs), recent studies employed LLMs as recommenders to provide personalized information services for distinct users. Despite efforts to improve the accuracy of LLM-based recommendation models, relatively little attention is paid to beyond-utility dimensions. Moreover, there are unique evaluation aspects of LLM-based recommendation models, which… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00331v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00331v1-abstract-full" style="display: none;"> With the rapid development of Large Language Models (LLMs), recent studies employed LLMs as recommenders to provide personalized information services for distinct users. Despite efforts to improve the accuracy of LLM-based recommendation models, relatively little attention is paid to beyond-utility dimensions. Moreover, there are unique evaluation aspects of LLM-based recommendation models, which have been largely ignored. To bridge this gap, we explore four new evaluation dimensions and propose a multidimensional evaluation framework. The new evaluation dimensions include: 1) history length sensitivity, 2) candidate position bias, 3) generation-involved performance, and 4) hallucinations. 
All four dimensions have the potential to impact performance, but are largely unnecessary for consideration in traditional systems. Using this multidimensional evaluation framework, along with traditional aspects, we evaluate the performance of seven LLM-based recommenders, with three prompting strategies, comparing them with six traditional models on both ranking and re-ranking tasks on four datasets. We find that LLMs excel at handling tasks with prior knowledge and shorter input histories in the ranking setting, and perform better in the re-ranking setting, beating traditional models across multiple dimensions. However, LLMs exhibit substantial candidate position bias issues, and some models hallucinate non-existent items much more often than others. We intend our evaluation framework and observations to benefit future research on the use of LLMs as recommenders. The code and data are available at https://github.com/JiangDeccc/EvaLLMasRecommender. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00331v1-abstract-full').style.display = 'none'; document.getElementById('2411.00331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00006">arXiv:2411.00006</a> <span> [<a href="https://arxiv.org/pdf/2411.00006">pdf</a>, <a href="https://arxiv.org/format/2411.00006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Personality-Guided Code Generation Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yaoqi Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenpeng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J+M">Jie M. Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00006v1-abstract-short" style="display: inline;"> Code generation, the automatic creation of source code from natural language descriptions, has garnered significant attention due to its potential to streamline software development. Inspired by research that links task-personality alignment with improved development outcomes, we conduct an empirical study on personality-guided code generation using large language models (LLMs). 
Specifically, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00006v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00006v1-abstract-full" style="display: none;"> Code generation, the automatic creation of source code from natural language descriptions, has garnered significant attention due to its potential to streamline software development. Inspired by research that links task-personality alignment with improved development outcomes, we conduct an empirical study on personality-guided code generation using large language models (LLMs). Specifically, we investigate how emulating personality traits appropriate to the coding tasks affects LLM performance. We extensively evaluate this approach using seven widely adopted LLMs across four representative datasets. Our results show that personality guidance significantly enhances code generation accuracy, with improved pass rates in 23 out of 28 LLM-dataset combinations. Notably, in 11 cases, the improvement exceeds 5%, and in 5 instances, it surpasses 10%, with the highest gain reaching 12.9%. Additionally, personality guidance can be easily integrated with other prompting strategies to further boost performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00006v1-abstract-full').style.display = 'none'; document.getElementById('2411.00006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23854">arXiv:2410.23854</a> <span> [<a href="https://arxiv.org/pdf/2410.23854">pdf</a>, <a href="https://arxiv.org/format/2410.23854">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Airway Labeling Meets Clinical Applications: Reflecting Topology Consistency and Outliers via Learnable Attentions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yun Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23854v1-abstract-short" style="display: inline;"> Accurate airway anatomical labeling is crucial for clinicians to identify and navigate complex bronchial structures during bronchoscopy. Automatic airway anatomical labeling is challenging due to significant individual variability and anatomical variations. Previous methods are prone to generate inconsistent predictions, which is harmful for preoperative planning and intraoperative navigation. 
Thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23854v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23854v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23854v1-abstract-full" style="display: none;"> Accurate airway anatomical labeling is crucial for clinicians to identify and navigate complex bronchial structures during bronchoscopy. Automatic airway anatomical labeling is challenging due to significant individual variability and anatomical variations. Previous methods are prone to generate inconsistent predictions, which is harmful for preoperative planning and intraoperative navigation. This paper aims to address these challenges by proposing a novel method that enhances topological consistency and improves the detection of abnormal airway branches. We propose a novel approach incorporating two modules: the Soft Subtree Consistency (SSC) and the Abnormal Branch Saliency (ABS). The SSC module constructs a soft subtree to capture clinically relevant topological relationships, allowing for flexible feature aggregation within and across subtrees. The ABS module facilitates the interaction between node features and prototypes to distinguish abnormal branches, preventing the erroneous aggregation of features between normal and abnormal nodes. Evaluated on a challenging dataset characterized by severe airway distortion and atrophy, our method achieves superior performance compared to state-of-the-art approaches. Specifically, it attains a 91.4% accuracy at the segmental level and an 83.7% accuracy at the subsegmental level, representing a 1.4% increase in subsegmental accuracy and a 3.1% increase in topological consistency. Notably, the method demonstrates reliable performance in cases with disease-induced airway deformities, ensuring consistent and accurate labeling. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23854v1-abstract-full').style.display = 'none'; document.getElementById('2410.23854v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23091">arXiv:2410.23091</a> <span> [<a href="https://arxiv.org/pdf/2410.23091">pdf</a>, <a href="https://arxiv.org/format/2410.23091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for Adversarial Defense </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingkun Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+K">Keping Bi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Quanrun Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiafeng Guo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23091v4-abstract-short" style="display: inline;"> Despite ongoing efforts to defend neural classifiers from adversarial attacks, they remain vulnerable, especially to unseen attacks. 
In contrast, humans are difficult to be cheated by subtle manipulations, since we make judgments only based on essential factors. Inspired by this observation, we attempt to model label generation with essential label-causative factors and incorporate label-non-causa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23091v4-abstract-full').style.display = 'inline'; document.getElementById('2410.23091v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23091v4-abstract-full" style="display: none;"> Despite ongoing efforts to defend neural classifiers from adversarial attacks, they remain vulnerable, especially to unseen attacks. In contrast, humans are difficult to be cheated by subtle manipulations, since we make judgments only based on essential factors. Inspired by this observation, we attempt to model label generation with essential label-causative factors and incorporate label-non-causative factors to assist data generation. For an adversarial example, we aim to discriminate the perturbations as non-causative factors and make predictions only based on the label-causative factors. Concretely, we propose a causal diffusion model (CausalDiff) that adapts diffusion models for conditional data generation and disentangles the two types of causal factors by learning towards a novel causal information bottleneck objective. Empirically, CausalDiff has significantly outperformed state-of-the-art defense methods on various unseen attacks, achieving an average robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on CIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition Benchmark). 
The code is available at \href{https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff}{https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23091v4-abstract-full').style.display = 'none'; document.getElementById('2410.23091v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21801">arXiv:2410.21801</a> <span> [<a href="https://arxiv.org/pdf/2410.21801">pdf</a>, <a href="https://arxiv.org/format/2410.21801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> PerSRV: Personalized Sticker Retrieval with Vision-Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chee%2C+H+E+M">Heng Er Metilda Chee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayin Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhiqiang Guo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2410.21801v1-abstract-short" style="display: inline;"> Instant Messaging is a popular means for daily communication, allowing users to send text and stickers. As the saying goes, "a picture is worth a thousand words", so developing an effective sticker retrieval technique is crucial for enhancing user experience. However, existing sticker retrieval methods rely on labeled data to interpret stickers, and general-purpose Vision-Language Models (VLMs) of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21801v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21801v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21801v1-abstract-full" style="display: none;"> Instant Messaging is a popular means for daily communication, allowing users to send text and stickers. As the saying goes, "a picture is worth a thousand words", so developing an effective sticker retrieval technique is crucial for enhancing user experience. However, existing sticker retrieval methods rely on labeled data to interpret stickers, and general-purpose Vision-Language Models (VLMs) often struggle to capture the unique semantics of stickers. Additionally, relevant-based sticker retrieval methods lack personalization, creating a gap between diverse user expectations and retrieval results. To address these, we propose the Personalized Sticker Retrieval with Vision-Language Model framework, namely PerSRV, structured into offline calculations and online processing modules. The online retrieval part follows the paradigm of relevant recall and personalized ranking, supported by the offline pre-calculation parts, which are sticker semantic understanding, utility evaluation and personalization modules. 
Firstly, for sticker-level semantic understanding, we supervised fine-tuned LLaVA-1.5-7B to generate human-like sticker semantics, complemented by textual content extracted from figures and historical interaction queries. Secondly, we investigate three crowd-sourcing metrics for sticker utility evaluation. Thirdly, we cluster style centroids based on users' historical interactions to achieve personal preference modeling. Finally, we evaluate our proposed PerSRV method on a public sticker retrieval dataset from WeChat, containing 543,098 candidates and 12,568 interactions. Experimental results show that PerSRV significantly outperforms existing methods in multi-modal sticker retrieval. Additionally, our fine-tuned VLM delivers notable improvements in sticker semantic understandings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21801v1-abstract-full').style.display = 'none'; document.getElementById('2410.21801v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21445">arXiv:2410.21445</a> <span> [<a href="https://arxiv.org/pdf/2410.21445">pdf</a>, <a href="https://arxiv.org/format/2410.21445">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TALE-teller: Tendon-Actuated Linked Element Robotic Testbed for Investigating Tail Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M+J">Margaret J. 
Zhang</a>, <a href="/search/cs?searchtype=author&query=Pradhan%2C+A+A">Anvay A. Pradhan</a>, <a href="/search/cs?searchtype=author&query=Brei%2C+Z">Zachary Brei</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+X">Xiangyun Bu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xiang Ye</a>, <a href="/search/cs?searchtype=author&query=Jamal%2C+S">Saima Jamal</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+C+W">Chae Woo Lim</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaonan Huang</a>, <a href="/search/cs?searchtype=author&query=Moore%2C+T+Y">Talia Y. Moore</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21445v1-abstract-short" style="display: inline;"> Tails serve various functions in both robotics and biology, including expression, grasping, and defense. The vertebrate tails associated with these functions exhibit diverse patterns of vertebral lengths, but the precise mechanisms linking form to function have not yet been established. Vertebrate tails are complex musculoskeletal structures, making both direct experimentation and computational mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21445v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21445v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21445v1-abstract-full" style="display: none;"> Tails serve various functions in both robotics and biology, including expression, grasping, and defense. The vertebrate tails associated with these functions exhibit diverse patterns of vertebral lengths, but the precise mechanisms linking form to function have not yet been established. Vertebrate tails are complex musculoskeletal structures, making both direct experimentation and computational modeling challenging. 
This paper presents Tendon-Actuated Linked-Element (TALE), a modular robotic test bed to explore how tail morphology influences function. By varying 3D printed bones, silicone joints, and tendon configurations, TALE can match the morphology of extant, extinct, and even theoretical tails. We first characterized the stiffness of our joint design empirically and in simulation before testing the hypothesis that tails with different vertebral proportions curve differently. We then compared the maximum bending state of two common vertebrate proportions and one theoretical morphology. Uniform bending of joints with different vertebral proportions led to substantial differences in the location of the tail tip, suggesting a significant influence on overall tail function. Future studies can introduce more complex morphologies to establish the mechanisms of diverse tail functions. With this foundational knowledge, we will isolate the key features underlying tail function to inform the design for robotic tails. Images and videos can be found on TALE's project page: https://www.embirlab.com/tale. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21445v1-abstract-full').style.display = 'none'; document.getElementById('2410.21445v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Goucher%2C+A+P">Adam P. 
Goucher</a>, <a href="/search/cs?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=M%C4%85dry%2C+A">Aleksander Mądry</a>, <a href="/search/cs?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al. 
(395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. 
In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20814">arXiv:2410.20814</a> <span> [<a href="https://arxiv.org/pdf/2410.20814">pdf</a>, <a href="https://arxiv.org/format/2410.20814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> NewTerm: Benchmarking Real-Time New Terms for Large Language Models with Annual Updates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hexuan Deng</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+W">Wenxiang Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuebo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20814v1-abstract-short" style="display: inline;"> Despite their remarkable abilities in various tasks, large language models (LLMs) still struggle with real-time information (e.g., new facts and terms) due to the knowledge cutoff in their development process. However, existing benchmarks focus on outdated content and limited fields, facing difficulties in real-time updating and leaving new terms unexplored. To address this problem, we propose an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20814v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20814v1-abstract-full" style="display: none;"> Despite their remarkable abilities in various tasks, large language models (LLMs) still struggle with real-time information (e.g., new facts and terms) due to the knowledge cutoff in their development process. However, existing benchmarks focus on outdated content and limited fields, facing difficulties in real-time updating and leaving new terms unexplored. To address this problem, we propose an adaptive benchmark, NewTerm, for real-time evaluation of new terms. We design a highly automated construction method to ensure high-quality benchmark construction with minimal human effort, allowing flexible updates for real-time information. Empirical results on various LLMs demonstrate over 20% performance reduction caused by new terms. Additionally, while updates to the knowledge cutoff of LLMs can cover some of the new terms, they are unable to generalize to more distant new terms. We also analyze which types of terms are more challenging and why LLMs struggle with new terms, paving the way for future research. 
Finally, we construct NewTerm 2022 and 2023 to evaluate the new terms updated each year and will continue updating annually. The benchmark and codes can be found at https://github.com/hexuandeng/NewTerm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20814v1-abstract-full').style.display = 'none'; document.getElementById('2410.20814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024 Datasets and Benchmarks Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20436">arXiv:2410.20436</a> <span> [<a href="https://arxiv.org/pdf/2410.20436">pdf</a>, <a href="https://arxiv.org/format/2410.20436">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoralSCOP-LAT: Labeling and Analyzing Tool for Coral Reef Images with Dense Mask </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wong%2C+Y">Yuk-Kwan Wong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Ziqiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingzhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Suggett%2C+D">David Suggett</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+S">Sai-Kit Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20436v1-abstract-short" style="display: inline;"> Images of coral reefs provide invaluable information, which is essentially critical for surveying and monitoring the coral reef ecosystems. Robust and precise identification of coral reef regions within surveying imagery is paramount for assessing coral coverage, spatial distribution, and other statistical analyses. However, existing coral reef analytical approaches mainly focus on sparse points s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20436v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20436v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20436v1-abstract-full" style="display: none;"> Images of coral reefs provide invaluable information, which is essentially critical for surveying and monitoring the coral reef ecosystems. Robust and precise identification of coral reef regions within surveying imagery is paramount for assessing coral coverage, spatial distribution, and other statistical analyses. However, existing coral reef analytical approaches mainly focus on sparse points sampled from the whole imagery, which are highly subject to the sampling density and cannot accurately express the coral abundance. Meanwhile, the analysis is both time-consuming and labor-intensive, and it is also limited to coral biologists. In this work, we propose CoralSCOP-LAT, an automatic and semi-automatic coral reef labeling and analysis tool, specially designed to segment coral reef regions (dense pixel masks) in coral reef images, significantly promoting analysis proficiency and accuracy. 
CoralSCOP-LAT leverages the advanced coral reef foundation model to accurately delineate coral regions, supporting dense coral reef analysis and reducing the dependency on manual annotation. The proposed CoralSCOP-LAT surpasses the existing tools by a large margin from analysis efficiency, accuracy, and flexibility. We perform comprehensive evaluations from various perspectives and the comparison demonstrates that CoralSCOP-LAT not only accelerates the coral reef analysis but also improves accuracy in coral segmentation and analysis. Our CoralSCOP-LAT, as the first dense coral reef analysis tool in the market, facilitates repeated large-scale coral reef monitoring analysis, contributing to more informed conservation efforts and sustainable management of coral reef ecosystems. Our tool will be available at https://coralscop.hkustvgd.com/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20436v1-abstract-full').style.display = 'none'; document.getElementById('2410.20436v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The coral reef labeling and analysis tool is available at https://coralscop.hkustvgd.com/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19998">arXiv:2410.19998</a> <span> [<a href="https://arxiv.org/pdf/2410.19998">pdf</a>, <a href="https://arxiv.org/format/2410.19998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3690639">10.1145/3690639 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Artificial Intelligence of Things: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Siam%2C+S+I">Shakhrul Iman Siam</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+H">Hyunho Ahn</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Alam%2C+S">Samiul Alam</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhichao Cao</a>, <a href="/search/cs?searchtype=author&query=Shroff%2C+N">Ness Shroff</a>, <a href="/search/cs?searchtype=author&query=Krishnamachari%2C+B">Bhaskar Krishnamachari</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+M">Mani Srivastava</a>, <a 
href="/search/cs?searchtype=author&query=Zhang%2C+M">Mi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19998v1-abstract-short" style="display: inline;"> The integration of the Internet of Things (IoT) and modern Artificial Intelligence (AI) has given rise to a new paradigm known as the Artificial Intelligence of Things (AIoT). In this survey, we provide a systematic and comprehensive review of AIoT research. We examine AIoT literature related to sensing, computing, and networking & communication, which form the three key components of AIoT. In add… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19998v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19998v1-abstract-full" style="display: none;"> The integration of the Internet of Things (IoT) and modern Artificial Intelligence (AI) has given rise to a new paradigm known as the Artificial Intelligence of Things (AIoT). In this survey, we provide a systematic and comprehensive review of AIoT research. We examine AIoT literature related to sensing, computing, and networking & communication, which form the three key components of AIoT. In addition to advancements in these areas, we review domain-specific AIoT systems that are designed for various important application domains. We have also created an accompanying GitHub repository, where we compile the papers included in this survey: https://github.com/AIoT-MLSys-Lab/AIoT-Survey. This repository will be actively maintained and updated with new research as it becomes available. As both IoT and AI become increasingly critical to our society, we believe AIoT is emerging as an essential research field at the intersection of IoT and modern AI. 
We hope this survey will serve as a valuable resource for those engaged in AIoT research and act as a catalyst for future explorations to bridge gaps and drive advancements in this exciting field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19998v1-abstract-full').style.display = 'none'; document.getElementById('2410.19998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ACM Transactions on Sensor Networks (TOSN)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACM Trans. Sen. Netw.(August 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18912">arXiv:2410.18912</a> <span> [<a href="https://arxiv.org/pdf/2410.18912">pdf</a>, <a href="https://arxiv.org/format/2410.18912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamic 3D Gaussian Tracking for Graph-Based Neural Dynamics Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingtong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifeng Zhang</a>, <a 
href="/search/cs?searchtype=author&query=Li%2C+Y">Yunzhu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18912v1-abstract-short" style="display: inline;"> Videos of robots interacting with objects encode rich information about the objects' dynamics. However, existing video prediction approaches typically do not explicitly account for the 3D information from videos, such as robot actions and objects' 3D states, limiting their use in real-world robotic applications. In this work, we introduce a framework to learn object dynamics directly from multi-vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18912v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18912v1-abstract-full" style="display: none;"> Videos of robots interacting with objects encode rich information about the objects' dynamics. However, existing video prediction approaches typically do not explicitly account for the 3D information from videos, such as robot actions and objects' 3D states, limiting their use in real-world robotic applications. In this work, we introduce a framework to learn object dynamics directly from multi-view RGB videos by explicitly considering the robot's action trajectories and their effects on scene dynamics. We utilize the 3D Gaussian representation of 3D Gaussian Splatting (3DGS) to train a particle-based dynamics model using Graph Neural Networks. This model operates on sparse control particles downsampled from the densely tracked 3D Gaussian reconstructions. By learning the neural dynamics model on offline robot interaction data, our method can predict object motions under varying initial configurations and unseen robot actions. 
The 3D transformations of Gaussians can be interpolated from the motions of control particles, enabling the rendering of predicted future object states and achieving action-conditioned video prediction. The dynamics model can also be applied to model-based planning frameworks for object manipulation tasks. We conduct experiments on various kinds of deformable materials, including ropes, clothes, and stuffed animals, demonstrating our framework's ability to model complex shapes and dynamics. Our project page is available at https://gs-dynamics.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18912v1-abstract-full').style.display = 'none'; document.getElementById('2410.18912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://gs-dynamics.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18797">arXiv:2410.18797</a> <span> [<a href="https://arxiv.org/pdf/2410.18797">pdf</a>, <a href="https://arxiv.org/format/2410.18797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Geodesics of Geometric Shape Deformations From Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+N">Nian Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miaomiao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18797v1-abstract-short" style="display: inline;"> This paper presents a novel method, named geodesic deformable networks (GDN), that for the first time enables the learning of geodesic flows of deformation fields derived from images. In particular, the capability of our proposed GDN being able to predict geodesics is important for quantifying and comparing deformable shape presented in images. 
The geodesic deformations, also known as optimal tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18797v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18797v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18797v1-abstract-full" style="display: none;"> This paper presents a novel method, named geodesic deformable networks (GDN), that for the first time enables the learning of geodesic flows of deformation fields derived from images. In particular, the capability of our proposed GDN being able to predict geodesics is important for quantifying and comparing deformable shape presented in images. The geodesic deformations, also known as optimal transformations that align pairwise images, are often parameterized by a time sequence of smooth vector fields governed by nonlinear differential equations. A bountiful literature has been focusing on learning the initial conditions (e.g., initial velocity fields) based on registration networks. However, the definition of geodesics central to deformation-based shape analysis is blind to the networks. To address this problem, we carefully develop an efficient neural operator to treat the geodesics as unknown mapping functions learned from the latent deformation spaces. A composition of integral operators and smooth activation functions is then formulated to effectively approximate such mappings. In contrast to previous works, our GDN jointly optimizes a newly defined geodesic loss, which adds additional benefits to promote the network regularizability and generalizability. We demonstrate the effectiveness of GDN on both 2D synthetic data and 3D real brain magnetic resonance imaging (MRI). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18797v1-abstract-full').style.display = 'none'; document.getElementById('2410.18797v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18693">arXiv:2410.18693</a> <span> [<a href="https://arxiv.org/pdf/2410.18693">pdf</a>, <a href="https://arxiv.org/format/2410.18693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unleashing Reasoning Capability of LLMs via Scalable Question Synthesis from Scratch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yuyang Ding</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xinyu Shi</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaobo Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juntao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiaoming Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18693v1-abstract-short" style="display: inline;"> The availability of high-quality data is one of the most important factors in improving the reasoning capability of LLMs. 
Existing works have demonstrated the effectiveness of creating more instruction data from seed questions or knowledge bases. Recent research indicates that continually scaling up data synthesis from strong models (e.g., GPT-4) can further elicit reasoning performance. Though pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18693v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18693v1-abstract-full" style="display: none;"> The availability of high-quality data is one of the most important factors in improving the reasoning capability of LLMs. Existing works have demonstrated the effectiveness of creating more instruction data from seed questions or knowledge bases. Recent research indicates that continually scaling up data synthesis from strong models (e.g., GPT-4) can further elicit reasoning performance. Though promising, the open-sourced community still lacks high-quality data at scale and scalable data synthesis methods with affordable costs. To address this, we introduce ScaleQuest, a scalable and novel data synthesis method that utilizes "small-size" (e.g., 7B) open-source models to generate questions from scratch without the need for seed data with complex augmentation constraints. With the efficient ScaleQuest, we automatically constructed a mathematical reasoning dataset consisting of 1 million problem-solution pairs, which are more effective than existing open-sourced datasets. It can universally increase the performance of mainstream open-source models (i.e., Mistral, Llama3, DeepSeekMath, and Qwen2-Math) by achieving 29.2% to 46.4% gains on MATH. 
Notably, simply fine-tuning the Qwen2-Math-7B-Base model with our dataset can even surpass Qwen2-Math-7B-Instruct, a strong and well-aligned model on closed-source data, and proprietary models such as GPT-4-Turbo and Claude-3.5 Sonnet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18693v1-abstract-full').style.display = 'none'; document.getElementById('2410.18693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Project page: https://scalequest.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18557">arXiv:2410.18557</a> <span> [<a href="https://arxiv.org/pdf/2410.18557">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Research on gesture recognition method based on SEDCNN-SVM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingjin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianming Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18557v1-abstract-short" style="display: inline;"> Gesture 
recognition based on surface electromyographic signal (sEMG) is one of the most used methods. The traditional manual feature extraction can only extract some low-level signal features, this causes poor classifier performance and low recognition accuracy when dealing with some complex signals. A recognition method, namely SEDCNN-SVM, is proposed to recognize sEMG of different gestures. SEDC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18557v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18557v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18557v1-abstract-full" style="display: none;"> Gesture recognition based on surface electromyographic signal (sEMG) is one of the most used methods. The traditional manual feature extraction can only extract some low-level signal features, this causes poor classifier performance and low recognition accuracy when dealing with some complex signals. A recognition method, namely SEDCNN-SVM, is proposed to recognize sEMG of different gestures. SEDCNN-SVM consists of an improved deep convolutional neural network (DCNN) and a support vector machine (SVM). The DCNN can automatically extract and learn the feature information of sEMG through the convolution operation of the convolutional layer, so that it can capture the complex and high-level features in the data. The Squeeze and Excitation Networks (SE-Net) and the residual module were added to the model, so that the feature representation of each channel could be improved, the loss of feature information in convolutional operations was reduced, useful feature information was captured, and the problem of network gradient vanishing was eased. The SVM can improve the generalization ability and classification accuracy of the model by constructing an optimal hyperplane of the feature space. 
Hence, the SVM was used to replace the full connection layer and the Softmax function layer of the DCNN, the use of a suitable kernel function in SVM can improve the model's generalization ability and classification accuracy. To verify the effectiveness of the proposed classification algorithm, this method is analyzed and compared with other comparative classification methods. The recognition accuracy of SEDCNN-SVM can reach 0.955, it is significantly improved compared with other classification methods, the SEDCNN-SVM model is recognized online in real time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18557v1-abstract-full').style.display = 'none'; document.getElementById('2410.18557v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18533">arXiv:2410.18533</a> <span> [<a href="https://arxiv.org/pdf/2410.18533">pdf</a>, <a href="https://arxiv.org/format/2410.18533">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LOGO -- Long cOntext aliGnment via efficient preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zecheng Tang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zechen Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juntao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiaoming Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18533v1-abstract-short" style="display: inline;"> Long-context models(LCMs) have shown great potential in processing long input sequences(even more than 100M tokens) conveniently and effectively. With significant progress, recent research has pointed out that LCMs can accurately locate token-level salient information within the context. 
Yet, the generation performance of these LCMs is far from satisfactory and might result in misaligned responses… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18533v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18533v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18533v1-abstract-full" style="display: none;"> Long-context models(LCMs) have shown great potential in processing long input sequences(even more than 100M tokens) conveniently and effectively. With significant progress, recent research has pointed out that LCMs can accurately locate token-level salient information within the context. Yet, the generation performance of these LCMs is far from satisfactory and might result in misaligned responses, such as hallucinations. To enhance the generation capability of LCMs, existing works have investigated the effects of data size and quality for both pre-training and instruction tuning. Though achieving meaningful improvement, previous methods fall short in either effectiveness or efficiency. In this paper, we introduce LOGO(Long cOntext aliGnment via efficient preference Optimization), a training strategy that first introduces preference optimization for long-context alignment. To overcome the GPU memory-bound issue caused by the long sequence, LOGO employs a reference-free preference optimization strategy and adopts a position synthesis method to construct the training data. By training with only 0.3B data on a single 8$\times$A800 GPU machine for 16 hours, LOGO allows the Llama-3-8B-Instruct-80K model to achieve comparable performance with GPT-4 in real-world long-context tasks while preserving the model's original capabilities on other tasks, e.g., language modeling and MMLU. Moreover, LOGO can extend the model's context window size while enhancing its generation performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18533v1-abstract-full').style.display = 'none'; document.getElementById('2410.18533v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18434">arXiv:2410.18434</a> <span> [<a href="https://arxiv.org/pdf/2410.18434">pdf</a>, <a href="https://arxiv.org/format/2410.18434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RediSwap: MEV Redistribution Mechanism for CFMMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Sen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18434v1-abstract-short" style="display: inline;"> Automated Market Makers (AMMs) are essential to decentralized finance, offering continuous liquidity and enabling intermediary-free trading on blockchains. However, participants in AMMs are vulnerable to Maximal Extractable Value (MEV) exploitation. 
Users face threats such as front-running, back-running, and sandwich attacks, while liquidity providers (LPs) incur the loss-versus-rebalancing (LVR).… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18434v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18434v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18434v1-abstract-full" style="display: none;"> Automated Market Makers (AMMs) are essential to decentralized finance, offering continuous liquidity and enabling intermediary-free trading on blockchains. However, participants in AMMs are vulnerable to Maximal Extractable Value (MEV) exploitation. Users face threats such as front-running, back-running, and sandwich attacks, while liquidity providers (LPs) incur the loss-versus-rebalancing (LVR). In this paper, we introduce RediSwap, a novel AMM designed to capture MEV at the application level and refund it fairly among users and liquidity providers. At its core, RediSwap features an MEV-redistribution mechanism that manages arbitrage opportunities within the AMM pool. We formalize the mechanism design problem and the desired game-theoretical properties. A central insight underpinning our mechanism is the interpretation of the maximal MEV value as the sum of LVR and individual user losses. We prove that our mechanism is incentive-compatible and Sybil-proof, and demonstrate that it is easy for arbitrageurs to participate. We empirically compared RediSwap with existing solutions by replaying historical AMM trades. Our results suggest that RediSwap can achieve better execution than UniswapX in 89% of trades and reduce LPs' loss to under 0.5% of the original LVR in most cases. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18434v1-abstract-full').style.display = 'none'; document.getElementById('2410.18434v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=200" class="pagination-link " aria-label="Goto page 5">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 
47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository