
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 3,031 results for author: <span class="mathjax">He, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/" aria-role="search"> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="He, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=He%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="He, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=He%2C+Y&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18499">arXiv:2411.18499</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18499">pdf</a>, <a href="https://arxiv.org/format/2411.18499">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GATE OpenING: A Comprehensive Benchmark for Judging Open-ended Interleaved Image-Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhou%2C+P">Pengfei Zhou</a>, <a href="/search/?searchtype=author&amp;query=Peng%2C+X">Xiaopeng Peng</a>, <a href="/search/?searchtype=author&amp;query=Song%2C+J">Jiajun Song</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+C">Chuanhao Li</a>, <a href="/search/?searchtype=author&amp;query=Xu%2C+Z">Zhaopan Xu</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+Y">Yue Yang</a>, <a href="/search/?searchtype=author&amp;query=Guo%2C+Z">Ziyao Guo</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/?searchtype=author&amp;query=Lin%2C+Y">Yuqi Lin</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yefei He</a>, <a href="/search/?searchtype=author&amp;query=Zhao%2C+L">Lirui Zhao</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+S">Shuo Liu</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+T">Tianhua Li</a>, <a href="/search/?searchtype=author&amp;query=Xie%2C+Y">Yuxuan Xie</a>, <a href="/search/?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a 
href="/search/?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/?searchtype=author&amp;query=Shao%2C+W">Wenqi Shao</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+K">Kaipeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18499v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding and generation tasks. However, generating interleaved image-text content remains a challenge, which requires integrated multimodal understanding and generation abilities. While the progress in unified models offers new solutions, existing benchmarks are insufficient for evaluating these methods due to da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18499v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18499v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18499v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding and generation tasks. However, generating interleaved image-text content remains a challenge, which requires integrated multimodal understanding and generation abilities. While the progress in unified models offers new solutions, existing benchmarks are insufficient for evaluating these methods due to data size and diversity limitations. To bridge this gap, we introduce GATE OpenING (OpenING), a comprehensive benchmark comprising 5,400 high-quality human-annotated instances across 56 real-world tasks. OpenING covers diverse daily scenarios such as travel guide, design, and brainstorming, offering a robust platform for challenging interleaved generation methods. In addition, we present IntJudge, a judge model for evaluating open-ended multimodal generation methods. Trained with a novel data pipeline, our IntJudge achieves an agreement rate of 82. 42% with human judgments, outperforming GPT-based evaluators by 11.34%. Extensive experiments on OpenING reveal that current interleaved generation methods still have substantial room for improvement. Key findings on interleaved image-text generation are further presented to guide the development of next-generation models. The OpenING is open-sourced at https://opening.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18499v1-abstract-full').style.display = 'none'; document.getElementById('2411.18499v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">53 pages, 19 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18013">arXiv:2411.18013</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18013">pdf</a>, <a href="https://arxiv.org/format/2411.18013">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FASIONAD : FAst and Slow FusION Thinking Systems for Human-Like Autonomous Driving with Adaptive Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Qian%2C+K">Kangan Qian</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+Z">Zhikun Ma</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yangfan He</a>, <a href="/search/?searchtype=author&amp;query=Luo%2C+Z">Ziang Luo</a>, <a href="/search/?searchtype=author&amp;query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/?searchtype=author&amp;query=Zhu%2C+T">Tianze Zhu</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jiayin Li</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+Z">Ziyu Chen</a>, <a href="/search/?searchtype=author&amp;query=He%2C+X">Xiao He</a>, <a href="/search/?searchtype=author&amp;query=Shi%2C+Y">Yining Shi</a>, <a href="/search/?searchtype=author&amp;query=Fu%2C+Z">Zheng Fu</a>, <a href="/search/?searchtype=author&amp;query=Jiao%2C+X">Xinyu Jiao</a>, <a href="/search/?searchtype=author&amp;query=Jiang%2C+K">Kun Jiang</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+D">Diange Yang</a>, <a href="/search/?searchtype=author&amp;query=Matsumaru%2C+T">Takafumi Matsumaru</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18013v1-abstract-short" style="display: inline;"> Ensuring safe, comfortable, and efficient navigation is a critical goal for autonomous driving systems. While end-to-end models trained on large-scale datasets excel in common driving scenarios, they often struggle with rare, long-tail events. Recent progress in large language models (LLMs) has introduced enhanced reasoning capabilities, but their computational demands pose challenges for real-tim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18013v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18013v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18013v1-abstract-full" style="display: none;"> Ensuring safe, comfortable, and efficient navigation is a critical goal for autonomous driving systems. While end-to-end models trained on large-scale datasets excel in common driving scenarios, they often struggle with rare, long-tail events. Recent progress in large language models (LLMs) has introduced enhanced reasoning capabilities, but their computational demands pose challenges for real-time decision-making and precise planning. 
This paper presents FASIONAD, a novel dual-system framework inspired by the cognitive model &#34;Thinking, Fast and Slow.&#34; The fast system handles routine navigation tasks using rapid, data-driven path planning, while the slow system focuses on complex reasoning and decision-making in challenging or unfamiliar situations. A dynamic switching mechanism based on score distribution and feedback allows seamless transitions between the two systems. Visual prompts generated by the fast system enable human-like reasoning in the slow system, which provides high-quality feedback to enhance the fast system&#39;s decision-making. To evaluate FASIONAD, we introduce a new benchmark derived from the nuScenes dataset, specifically designed to differentiate fast and slow scenarios. FASIONAD achieves state-of-the-art performance on this benchmark, establishing a new standard for frameworks integrating fast and slow cognitive processes in autonomous driving. This approach paves the way for more adaptive, human-like autonomous driving systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18013v1-abstract-full').style.display = 'none'; document.getElementById('2411.18013v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17138">arXiv:2411.17138</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17138">pdf</a>, <a href="https://arxiv.org/format/2411.17138">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> HGC: A hybrid method combining gravity model and cycle structure for identifying influential spreaders in complex networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jiaxun Li</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yonghou He</a>, <a href="/search/?searchtype=author&amp;query=Dong%2C+Z">Zhefan Dong</a>, <a href="/search/?searchtype=author&amp;query=Tao%2C+L">Li Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17138v1-abstract-short" style="display: inline;"> Identifying influential spreaders in complex networks is a critical challenge in network science, with broad applications in disease control, information dissemination, and influence analysis in social networks. 
The gravity model, a distinctive approach for identifying influential spreaders, has attracted significant attention due to its ability to integrate node influence and the distance between&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17138v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17138v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17138v1-abstract-full" style="display: none;"> Identifying influential spreaders in complex networks is a critical challenge in network science, with broad applications in disease control, information dissemination, and influence analysis in social networks. The gravity model, a distinctive approach for identifying influential spreaders, has attracted significant attention due to its ability to integrate node influence and the distance between nodes. However, the law of gravity is symmetric, whereas the influence between different nodes is asymmetric. Existing gravity model-based methods commonly rely on the topological distance as a metric to measure the distance between nodes. Such reliance neglects the strength or frequency of connections between nodes, resulting in symmetric influence values between node pairs, which ultimately leads to an inaccurate assessment of node influence. Moreover, these methods often overlook cycle structures within networks, which provide redundant pathways for nodes and contribute significantly to the overall connectivity and stability of the network. In this paper, we propose a hybrid method called HGC, which integrates the gravity model with effective distance and incorporates cycle structure to address the issues above. Effective distance, derived from probabilities, measures the distance between a source node and others by considering its connectivity, providing a more accurate reflection of actual relationships between nodes. To evaluate the accuracy and effectiveness of the proposed method, we conducted several experiments on eight real-world networks based on the Susceptible-Infected-Recovered model. The results demonstrate that HGC outperforms seven compared methods in accurately identifying influential nodes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17138v1-abstract-full').style.display = 'none'; document.getElementById('2411.17138v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16774">arXiv:2411.16774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16774">pdf</a>, <a href="https://arxiv.org/ps/2411.16774">ps</a>, <a href="https://arxiv.org/format/2411.16774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Mathematics">math.GM</span> </div> </div> <p class="title is-5 mathjax"> A Note on a Recent Attempt to Prove the Irrationality of $味(5)$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Chen%2C+K">Keyu Chen</a>, <a href="/search/?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yixin He</a>, <a href="/search/?searchtype=author&amp;query=Huang%2C+Y">Yuxiang Huang</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+Y">Yanyang Li</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+Q">Quanyu Tang</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+L">Lei Wu</a>, <a href="/search/?searchtype=author&amp;query=Xu%2C+S">Shenhao Xu</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/?searchtype=author&amp;query=Yu%2C+Z">Zijun Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16774v2-abstract-short" style="display: inline;"> Recently Shekhar Suman [arXiv: 2407.07121v6 [math.GM] 3 Aug 2024] made an attempt to prove the irrationality of $味(5)$. But unfortunately the proof is not correct. In this note, we discuss the fallacy in the proof. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16774v2-abstract-full" style="display: none;"> Recently Shekhar Suman [arXiv: 2407.07121v6 [math.GM] 3 Aug 2024] made an attempt to prove the irrationality of $味(5)$. But unfortunately the proof is not correct. In this note, we discuss the fallacy in the proof. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16774v2-abstract-full').style.display = 'none'; document.getElementById('2411.16774v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, just a note</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> Primary 11J72; Secondary 11M06 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16652">arXiv:2411.16652</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16652">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> Al0.68Sc0.32N/SiC based metal-ferroelectric-semiconductor capacitors operating up to 900 C </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yunfei He</a>, <a href="/search/?searchtype=author&amp;query=Moore%2C+D+C">David C. Moore</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yubo Wang</a>, <a href="/search/?searchtype=author&amp;query=Ware%2C+S">Spencer Ware</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+S">Sizhe Ma</a>, <a href="/search/?searchtype=author&amp;query=Pradhan%2C+D+K">Dhiren K. Pradhan</a>, <a href="/search/?searchtype=author&amp;query=Hu%2C+Z">Zekun Hu</a>, <a href="/search/?searchtype=author&amp;query=Du%2C+X">Xingyu Du</a>, <a href="/search/?searchtype=author&amp;query=Kennedy%2C+W+J">W. Joshua Kennedy</a>, <a href="/search/?searchtype=author&amp;query=Glavin%2C+N+R">Nicholas R. Glavin</a>, <a href="/search/?searchtype=author&amp;query=Olsson%2C+R+H">Roy H. Olsson III</a>, <a href="/search/?searchtype=author&amp;query=Jariwala%2C+D">Deep Jariwala</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16652v1-abstract-short" style="display: inline;"> Ferroelectric (FE)-based devices show great promise for non-volatile memory applications, yet few demonstrate reliable operation at elevated temperatures. In this work, we fabricated and characterized metal ferroelectric semiconductor capacitors integrating Aluminum Scandium Nitride onto Silicon Carbide, a prospective high temperature semiconductor for logic operations in extreme environments. The&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16652v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16652v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16652v1-abstract-full" style="display: none;"> Ferroelectric (FE)-based devices show great promise for non-volatile memory applications, yet few demonstrate reliable operation at elevated temperatures. In this work, we fabricated and characterized metal ferroelectric semiconductor capacitors integrating Aluminum Scandium Nitride onto Silicon Carbide, a prospective high temperature semiconductor for logic operations in extreme environments. The resultant Ni/Al0.68Sc0.32N/4H-SiC structure was evaluated for non-volatile memory performance from room temperature to high-temperature conditions. The 30-nm thick Al0.68Sc0.32N/SiC-based ferroelectric capacitors demonstrated ferroelectric switching at 900 C. 
The coercive field of the FE layer decreased linearly from -6.4/+11.9 MV cm-1 at room temperature to -3.1/+7.8 MV cm-1 at 800 C. Using positive-up negative-down measurements, we characterized the temperature dependence of remanent polarization. At 600 C, the devices achieved remarkable reliability, demonstrating endurance of ~2000 cycles and retention exceeding 100 hours with negligible polarization loss. Further reliability measurements extended to 800 C with 10,000 secs retention and &gt; 300 endurance cycles, establish these devices as promising candidates for high-temperature memory applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16652v1-abstract-full').style.display = 'none'; document.getElementById('2411.16652v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16045">arXiv:2411.16045</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16045">pdf</a>, <a href="https://arxiv.org/format/2411.16045">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Number Theory">math.NT</span> </div> </div> <p class="title is-5 mathjax"> Dichotomy laws for the Hausdorff measure of shrinking target sets in $尾$-dynamical systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yubin He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16045v1-abstract-short" style="display: inline;"> In this paper, we investigate the Hausdorff measure of shrinking target sets in $尾$-dynamical systems. These sets are dynamically defined in analogy to the classical theory of weighted and multiplicative approximation. While the Lebesgue measure and Hausdorff dimension theories for these sets are well-understood, the Hausdorff measure theory in even one-dimensional settings remains unknown. We sho&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16045v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16045v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16045v1-abstract-full" style="display: none;"> In this paper, we investigate the Hausdorff measure of shrinking target sets in $尾$-dynamical systems. These sets are dynamically defined in analogy to the classical theory of weighted and multiplicative approximation. While the Lebesgue measure and Hausdorff dimension theories for these sets are well-understood, the Hausdorff measure theory in even one-dimensional settings remains unknown. We show that the Hausdorff measure of these sets is either zero or full depending upon the convergence or divergence of a certain series, thus providing a rather complete measure theoretic description of these sets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16045v1-abstract-full').style.display = 'none'; document.getElementById('2411.16045v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15203">arXiv:2411.15203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15203">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multimodal large language model for wheat breeding: a new exploration of smart breeding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yang%2C+G">Guofeng Yang</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+Y">Yu Li</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yong He</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+Z">Zhenjiang Zhou</a>, <a href="/search/?searchtype=author&amp;query=Ye%2C+L">Lingzhen Ye</a>, <a href="/search/?searchtype=author&amp;query=Fang%2C+H">Hui Fang</a>, <a href="/search/?searchtype=author&amp;query=Luo%2C+Y">Yiqi Luo</a>, <a href="/search/?searchtype=author&amp;query=Feng%2C+X">Xuping Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15203v1-abstract-short" style="display: inline;"> UAV remote sensing technology has become a key technology in crop breeding, which can achieve high-throughput and non-destructive collection of crop phenotyping data. However, the multidisciplinary nature of breeding has brought technical barriers and efficiency challenges to knowledge mining. Therefore, it is important to develop a smart breeding goal tool to mine cross-domain multimodal data. Ba&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15203v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15203v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15203v1-abstract-full" style="display: none;"> UAV remote sensing technology has become a key technology in crop breeding, which can achieve high-throughput and non-destructive collection of crop phenotyping data. However, the multidisciplinary nature of breeding has brought technical barriers and efficiency challenges to knowledge mining. Therefore, it is important to develop a smart breeding goal tool to mine cross-domain multimodal data. 
Based on different pre-trained open-source multimodal large language models (MLLMs) (e.g., Qwen-VL, InternVL, Deepseek-VL), this study used supervised fine-tuning (SFT), retrieval-augmented generation (RAG), and reinforcement learning from human feedback (RLHF) technologies to inject cross-domain knowledge into MLLMs, thereby constructing multiple multimodal large language models for wheat breeding (WBLMs). The above WBLMs were evaluated using the newly created evaluation benchmark in this study. The results showed that the WBLM constructed using SFT, RAG and RLHF technologies and InternVL2-8B has leading performance. Then, subsequent experiments were conducted using the WBLM. Ablation experiments indicated that the combination of SFT, RAG, and RLHF technologies can improve the overall generation performance, enhance the generated quality, balance the timeliness and adaptability of the generated answer, and reduce hallucinations and biases. The WBLM performed best in wheat yield prediction using cross-domain data (remote sensing, phenotyping, weather, germplasm) simultaneously, with R2 and RMSE of 0.821 and 489.254 kg/ha, respectively. Furthermore, the WBLM can generate professional decision support answers for phenotyping estimation, environmental stress assessment, target germplasm screening, cultivation technique recommendation, and seed price query tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15203v1-abstract-full').style.display = 'none'; document.getElementById('2411.15203v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15037">arXiv:2411.15037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15037">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> </div> </div> <p class="title is-5 mathjax"> Ultra-High-Efficiency Dual-Band Thin-Film Lithium Niobate Modulator Incorporating Low-k Underfill with 220 GHz Extrapolated Bandwidth for 390 Gbit/s PAM8 Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+H">Hao Liu</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yutong He</a>, <a href="/search/?searchtype=author&amp;query=Xiong%2C+B">Bing Xiong</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+C">Changzheng Sun</a>, <a href="/search/?searchtype=author&amp;query=Hao%2C+Z">Zhibiao Hao</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+L">Lai Wang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+J">Jian Wang</a>, <a href="/search/?searchtype=author&amp;query=Han%2C+Y">Yanjun Han</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+H">Hongtao Li</a>, <a href="/search/?searchtype=author&amp;query=Gan%2C+L">Lin Gan</a>, <a href="/search/?searchtype=author&amp;query=Luo%2C+Y">Yi Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15037v1-abstract-short" style="display: inline;"> High-performance electro-optic modulators play a critical role in modern telecommunication networks and intra-datacenter interconnects. Low driving voltage, large electro-optic bandwidth, compact device size, and multi-band operation ability are essential for various application scenarios, especially energy-efficient high-speed data transmission. However, it is challenging to meet all these requir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15037v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15037v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15037v1-abstract-full" style="display: none;"> High-performance electro-optic modulators play a critical role in modern telecommunication networks and intra-datacenter interconnects. Low driving voltage, large electro-optic bandwidth, compact device size, and multi-band operation ability are essential for various application scenarios, especially energy-efficient high-speed data transmission. However, it is challenging to meet all these requirements simultaneously. Here, we demonstrate a high-performance dual-band thin-film lithium niobate electro-optic modulator with low-k underfill to achieve overall performance improvement. The low-k material helps reduce the RF loss of the modulator and achieve perfect velocity matching with narrow electrode gap to overcome the voltage-bandwidth limitation, extending electro-optic bandwidth and enhancing modulation efficiency simultaneously. 
The fabricated 7-mm-long modulator exhibits a low half-wave voltage of 1.9 V at C-band and 1.54 V at O-band, featuring a low half-wave voltage-length product of 1.33 V*cm and 1.08 V*cm, respectively. Meanwhile, the novel design yields an ultra-wide extrapolated 3 dB bandwidth of 220 GHz (218 GHz) in the C-band (O-band). High-speed data transmission in both C- and O-bands using the same device has been demonstrated for the first time by PAM8 with data rates up to 390 Gbit/s, corresponding to a record-low energy consumption of 0.69 fJ/bit for next-generation cost-effective ultra-high-speed optical communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15037v1-abstract-full').style.display = 'none'; document.getElementById('2411.15037v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14299">arXiv:2411.14299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14299">pdf</a>, <a href="https://arxiv.org/format/2411.14299">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Masala-CHAI: A Large-Scale SPICE Netlist Dataset for Analog Circuits by Harnessing AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Bhandari%2C+J">Jitendra Bhandari</a>, <a href="/search/?searchtype=author&amp;query=Bhat%2C+V">Vineet Bhat</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yuheng He</a>, <a href="/search/?searchtype=author&amp;query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/?searchtype=author&amp;query=Rahmani%2C+H">Hamed Rahmani</a>, <a href="/search/?searchtype=author&amp;query=Karri%2C+R">Ramesh Karri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14299v2-abstract-short" style="display: inline;"> Masala-CHAI is the first fully automated framework leveraging large language models (LLMs) to generate Simulation Programs with Integrated Circuit Emphasis (SPICE) netlists. It addresses a long-standing challenge in automating netlist generation for analog circuits within circuit design automation. Automating this workflow could accelerate the creation of finetuned LLMs for analog circuit design a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14299v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14299v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14299v2-abstract-full" style="display: none;"> Masala-CHAI is the first fully automated framework leveraging large language models (LLMs) to generate Simulation Programs with Integrated Circuit Emphasis (SPICE) netlists. It addresses a long-standing challenge in automating netlist generation for analog circuits within circuit design automation. 
Automating this workflow could accelerate the creation of finetuned LLMs for analog circuit design and verification. We identify key challenges in this automation and evaluate the multi-modal capabilities of state-of-the-art LLMs, particularly GPT-4, to address these issues. We propose a three-step workflow to overcome current limitations: labeling analog circuits, prompt tuning, and netlist verification. This approach aims to create an end-to-end SPICE netlist generator from circuit schematic images, tackling the long-standing hurdle of accurate netlist generation. Our framework demonstrates significant performance improvements, tested on approximately 2,100 schematics of varying complexity. We open-source this solution for community-driven development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14299v2-abstract-full').style.display = 'none'; document.getElementById('2411.14299v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13979">arXiv:2411.13979</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13979">pdf</a>, <a href="https://arxiv.org/format/2411.13979">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FedRAV: Hierarchically Federated Region-Learning for Traffic Object Classification of Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhai%2C+Y">Yijun Zhai</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+P">Pengzhan Zhou</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yuepeng He</a>, <a href="/search/?searchtype=author&amp;query=Qu%2C+F">Fang Qu</a>, <a href="/search/?searchtype=author&amp;query=Qin%2C+Z">Zhida Qin</a>, <a href="/search/?searchtype=author&amp;query=Jiao%2C+X">Xianlong Jiao</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+G">Guiyan Liu</a>, <a href="/search/?searchtype=author&amp;query=Guo%2C+S">Songtao Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13979v1-abstract-short" style="display: inline;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. 
However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13979v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13979v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13979v1-abstract-full" style="display: none;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challenged by non-independent and identically distributed (Non-IID) data of vehicles, which may lead to failed convergence and low training accuracy. In this paper, we propose a novel hierarchically Federated Region-learning framework of Autonomous Vehicles (FedRAV), a two-stage framework, which adaptively divides a large area containing vehicles into sub-regions based on the defined region-wise distance, and achieves personalized vehicular models and regional models. This approach ensures that the personalized vehicular model adopts the beneficial models while discarding the unprofitable ones. We validate our FedRAV framework against existing federated learning algorithms on three real-world autonomous driving datasets in various heterogeneous settings. The experiment results demonstrate that our framework outperforms those known algorithms, and improves the accuracy by at least 3.69%. The source code of FedRAV is available at: https://github.com/yjzhai-cs/FedRAV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13979v1-abstract-full').style.display = 'none'; document.getElementById('2411.13979v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
Comments: 8 pages, 4 figures

arXiv:2411.13503 (https://arxiv.org/abs/2411.13503) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
VBench++: Comprehensive and Versatile Benchmark Suite for Video Generative Models
Authors: Ziqi Huang, Fan Zhang, Xiaojie Xu, Yinan He, Jiashuo Yu, Ziyue Dong, Qianli Ma, Nattapol Chanpaisit, Chenyang Si, Yuming Jiang, Yaohui Wang, Xinyuan Chen, Ying-Cong Chen, Limin Wang, Dahua Lin, Yu Qiao, Ziwei Liu
Abstract: Video generation has witnessed significant advancements, yet evaluating these models remains a challenge. A comprehensive evaluation benchmark for video generation is indispensable for two reasons: 1) existing metrics do not fully align with human perceptions; 2) an ideal evaluation system should provide insights to inform future development of video generation. To this end, we present VBench, a comprehensive benchmark suite that dissects "video generation quality" into specific, hierarchical, and disentangled dimensions, each with tailored prompts and evaluation methods. VBench has several appealing properties: 1) Comprehensive Dimensions: VBench comprises 16 dimensions of video generation (e.g., subject identity inconsistency, motion smoothness, temporal flickering, and spatial relationship), and its fine-grained evaluation metrics reveal individual models' strengths and weaknesses. 2) Human Alignment: We also provide a dataset of human preference annotations to validate the benchmark's alignment with human perception for each evaluation dimension. 3) Valuable Insights: We examine current models' ability across evaluation dimensions and content types, and investigate the gaps between video and image generation models. 4) Versatile Benchmarking: VBench++ supports evaluating both text-to-video and image-to-video generation. We introduce a high-quality Image Suite with an adaptive aspect ratio to enable fair evaluations across different image-to-video generation settings. Beyond assessing technical quality, VBench++ evaluates the trustworthiness of video generative models, providing a more holistic view of model performance. 5) Full Open-Sourcing: We fully open-source VBench++ and continually add new video generation models to our leaderboard to drive forward the field of video generation.
Submitted 20 November, 2024; originally announced November 2024.
Comments: Leaderboard: https://huggingface.co/spaces/Vchitect/VBench_Leaderboard; Code: https://github.com/Vchitect/VBench; Project page: https://vchitect.github.io/VBench-project/; extension of arXiv:2311.17982. arXiv admin note: substantial text overlap with arXiv:2311.17982

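A generic illustration of the two ingredients this entry emphasizes, per-dimension scoring and human-alignment validation, is sketched below. The function and variable names are hypothetical; this is not VBench's actual code or protocol.

    # Hypothetical sketch of dimension-wise benchmarking with a human-alignment check.
    from typing import Callable, Dict, List
    from scipy.stats import spearmanr

    def evaluate_model(videos: List[object],
                       metrics: Dict[str, Callable[[object], float]]) -> Dict[str, float]:
        # Average each per-dimension metric (e.g. motion smoothness) over all generated videos.
        return {dim: sum(metric(v) for v in videos) / len(videos)
                for dim, metric in metrics.items()}

    def human_alignment(auto_scores: List[float], human_prefs: List[float]) -> float:
        # Rank correlation between automatic per-model scores and human preference annotations.
        rho, _ = spearmanr(auto_scores, human_prefs)
        return float(rho)
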
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13418v1-abstract-full').style.display = 'none'; document.getElementById('2411.13418v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13112">arXiv:2411.13112</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13112">pdf</a>, <a href="https://arxiv.org/format/2411.13112">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DriveMLLM: A Benchmark for Spatial Understanding with Multimodal Large Language Models in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Guo%2C+X">Xianda Guo</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+R">Ruijun Zhang</a>, <a href="/search/?searchtype=author&amp;query=Duan%2C+Y">Yiqun Duan</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yuhang He</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+S">Shuai Liu</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13112v2-abstract-short" style="display: inline;"> Autonomous driving requires a comprehensive understanding of 3D environments to facilitate high-level tasks such as motion prediction, planning, and mapping. In this paper, we introduce DriveMLLM, a benchmark specifically designed to evaluate the spatial understanding capabilities of multimodal large language models (MLLMs) in autonomous driving. DriveMLLM includes 880 front-facing camera images a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13112v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13112v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13112v2-abstract-full" style="display: none;"> Autonomous driving requires a comprehensive understanding of 3D environments to facilitate high-level tasks such as motion prediction, planning, and mapping. In this paper, we introduce DriveMLLM, a benchmark specifically designed to evaluate the spatial understanding capabilities of multimodal large language models (MLLMs) in autonomous driving. DriveMLLM includes 880 front-facing camera images and introduces both absolute and relative spatial reasoning tasks, accompanied by linguistically diverse natural language questions. To measure MLLMs&#39; performance, we propose novel evaluation metrics focusing on spatial understanding. 
arXiv:2411.13112 (https://arxiv.org/abs/2411.13112) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
DriveMLLM: A Benchmark for Spatial Understanding with Multimodal Large Language Models in Autonomous Driving
Authors: Xianda Guo, Ruijun Zhang, Yiqun Duan, Yuhang He, Chenming Zhang, Shuai Liu, Long Chen
Abstract: Autonomous driving requires a comprehensive understanding of 3D environments to facilitate high-level tasks such as motion prediction, planning, and mapping. In this paper, we introduce DriveMLLM, a benchmark specifically designed to evaluate the spatial understanding capabilities of multimodal large language models (MLLMs) in autonomous driving. DriveMLLM includes 880 front-facing camera images and introduces both absolute and relative spatial reasoning tasks, accompanied by linguistically diverse natural language questions. To measure MLLMs' performance, we propose novel evaluation metrics focusing on spatial understanding. We evaluate several state-of-the-art MLLMs on DriveMLLM, and our results reveal the limitations of current models in understanding complex spatial relationships in driving contexts. We believe these findings underscore the need for more advanced MLLM-based spatial reasoning methods and highlight the potential of DriveMLLM to drive further research in autonomous driving. Code will be available at https://github.com/XiandaGuo/Drive-MLLM.
Submitted 26 November, 2024; v1 submitted 20 November, 2024; originally announced November 2024.
Comments: Code will be available at https://github.com/XiandaGuo/Drive-MLLM

arXiv:2411.12971 (https://arxiv.org/abs/2411.12971) [pdf, ps, other]
Subjects: math.GT (Geometric Topology); math.DG (Differential Geometry); math.SP (Spectral Theory)
Averages of determinants of Laplacians over moduli spaces for large genus
Authors: Yuxin He, Yunhui Wu
Abstract: Let $\mathcal{M}_g$ be the moduli space of hyperbolic surfaces of genus $g$ endowed with the Weil-Petersson metric. We view the regularized determinant $\log \det(\Delta_{X})$ of the Laplacian as a function on $\mathcal{M}_g$ and show that there exists a universal constant $E>0$ such that as $g\to \infty$, (1) the expected value of $\left|\frac{\log \det(\Delta_{X})}{4\pi(g-1)}-E \right|$ over $\mathcal{M}_g$ has rate of decay $g^{-\delta}$ for some uniform constant $\delta\in (0,1)$; (2) the expected value of $\left|\frac{\log \det(\Delta_{X})}{4\pi(g-1)}\right|^{\beta}$ over $\mathcal{M}_g$ approaches $E^{\beta}$ whenever $\beta\in [1,2)$.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 20 pages, comments are welcome

arXiv:2411.12915 (https://arxiv.org/abs/2411.12915) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
VILA-M3: Enhancing Vision-Language Models with Medical Expert Knowledge
Authors: Vishwesh Nath, Wenqi Li, Dong Yang, Andriy Myronenko, Mingxin Zheng, Yao Lu, Zhijian Liu, Hongxu Yin, Yee Man Law, Yucheng Tang, Pengfei Guo, Can Zhao, Ziyue Xu, Yufan He, Greg Heinrich, Stephen Aylward, Marc Edgar, Michael Zephyr, Pavlo Molchanov, Baris Turkbey, Holger Roth, Daguang Xu
Abstract: Generalist vision-language models (VLMs) have made significant strides in computer vision, but they fall short in specialized fields like healthcare, where expert knowledge is essential. In traditional computer vision tasks, creative or approximate answers may be acceptable, but in healthcare, precision is paramount. Current large multimodal models like Gemini and GPT-4o are insufficient for medical tasks due to their reliance on memorized internet knowledge rather than the nuanced expertise required in healthcare. VLMs are usually trained in three stages: vision pre-training, vision-language pre-training, and instruction fine-tuning (IFT). IFT has typically been applied using a mixture of generic and healthcare data. In contrast, we propose that for medical VLMs a fourth stage of specialized IFT is necessary, one that focuses on medical data and includes information from domain expert models. Domain expert models developed for medical use are crucial because they are specifically trained for certain clinical tasks, e.g., to detect tumors and classify abnormalities through segmentation and classification; they learn fine-grained features of medical data that are often too intricate for a VLM to capture effectively, especially in radiology. This paper introduces a new framework, VILA-M3, for medical VLMs that utilizes domain knowledge via expert models. Through our experiments, we show improved state-of-the-art (SOTA) performance, with an average improvement of ~9% over the prior SOTA model Med-Gemini and ~6% over models trained on the specific tasks. Our approach emphasizes the importance of domain expertise in creating precise, reliable VLMs for medical applications.
Submitted 19 November, 2024; originally announced November 2024.

arXiv:2411.11871 (https://arxiv.org/abs/2411.11871) [pdf, other]
Subjects: cs.IR (Information Retrieval); cs.LG (Machine Learning); math.OC (Optimization and Control)
MultiBalance: Multi-Objective Gradient Balancing in Industrial-Scale Multi-Task Recommendation System
Authors: Yun He, Xuxing Chen, Jiayi Xu, Renqin Cai, Yiling You, Jennifer Cao, Minhui Huang, Liu Yang, Yiqun Liu, Xiaoyi Liu, Rong Jin, Sem Park, Bo Long, Xue Feng
Abstract: In industrial recommendation systems, multi-task learning (learning multiple tasks simultaneously on a single model) is a predominant approach to save training/serving resources and improve recommendation performance via knowledge transfer between the jointly learned tasks. However, multi-task learning often suffers from negative transfer: one or several tasks are less optimized than when trained separately. To carefully balance the optimization, we propose a gradient balancing approach called MultiBalance, which is suitable for industrial-scale multi-task recommendation systems. It balances the per-task gradients to alleviate negative transfer, while avoiding the huge cost of grid search or manual exploration of appropriate task weights. Moreover, compared with prior work that typically balances the per-task gradients of shared parameters, MultiBalance is more efficient since it only requires access to per-task gradients with respect to the shared feature representations. We conduct experiments on Meta's large-scale ads and feeds multi-task recommendation system and observe that MultiBalance achieves significant gains (e.g., a 0.738% improvement in normalized entropy (NE)) with neutral training cost in queries per second (QPS), which is significantly more efficient than prior methods that balance per-task gradients of shared parameters at the price of 70-80% QPS degradation.
Submitted 3 November, 2024; originally announced November 2024.

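The efficiency argument above, taking per-task gradients with respect to the shared feature representation rather than with respect to all shared parameters, can be illustrated with a minimal PyTorch-style sketch. The equal-norm weighting and every name below are hypothetical stand-ins, not the MultiBalance algorithm itself.

    # Minimal sketch: balance per-task gradients w.r.t. the shared representation only.
    import torch

    def balanced_multitask_loss(shared_repr: torch.Tensor, task_losses: list) -> torch.Tensor:
        # Per-task gradients w.r.t. the shared representation (cheap), not all shared parameters.
        grads = [torch.autograd.grad(loss, shared_repr, retain_graph=True)[0]
                 for loss in task_losses]
        norms = torch.stack([g.norm() for g in grads])
        # Hypothetical equal-norm weighting: rescale so every task contributes comparably.
        weights = (norms.mean() / (norms + 1e-12)).detach()
        return sum(w * loss for w, loss in zip(weights, task_losses))

The returned scalar would then be backpropagated as usual; the point of the sketch is only that the balancing statistics come from gradients of a single intermediate tensor.
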
arXiv:2411.11628 (https://arxiv.org/abs/2411.11628) [pdf, ps, other]
Subjects: math.OC (Optimization and Control)
Linear Convergence of the Proximal Gradient Method for Composite Optimization Under the Polyak-Łojasiewicz Inequality and Its Variant
Authors: Qingyuan Kong, Rujun Jiang, Yihan He
Abstract: We study the linear convergence rates of the proximal gradient method for composite functions satisfying two classes of Polyak-Łojasiewicz (PL) inequality: the PL inequality itself, and the variant of the PL inequality defined through the proximal map-based residual. Using the performance estimation problem, we either provide new explicit linear convergence rates or improve existing complexity bounds for minimizing composite functions under these two classes of PL inequality.
Submitted 18 November, 2024; originally announced November 2024.

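For orientation, the standard objects involved (not reproduced from the paper) are the proximal gradient step for a composite objective $F = f + g$ with $f$ smooth and $g$ proper closed convex, the classical PL inequality, and the prox-based residual that one common form of the variant uses in place of the gradient:

$$
x_{k+1} \;=\; \operatorname{prox}_{\gamma g}\!\big(x_k - \gamma \nabla f(x_k)\big),
\qquad
\operatorname{prox}_{\gamma g}(y) \;=\; \arg\min_{x}\; \Big\{\, g(x) + \tfrac{1}{2\gamma}\|x-y\|^2 \,\Big\},
$$

$$
\tfrac{1}{2}\,\|\nabla f(x)\|^2 \;\ge\; \mu\,\big(f(x) - f^{\star}\big)
\quad \text{(PL, smooth case)},
\qquad
r_{\gamma}(x) \;=\; \tfrac{1}{\gamma}\big(x - \operatorname{prox}_{\gamma g}(x - \gamma \nabla f(x))\big).
$$

Under such conditions one expects a contraction of the form $F(x_{k+1}) - F^{\star} \le (1-c)\,\big(F(x_k) - F^{\star}\big)$ for some $c \in (0,1)$ depending on $\mu$, the smoothness constant of $f$, and the step size $\gamma$; the paper's contribution is pinning down explicit rates of this kind via performance estimation.
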
arXiv:2411.10830 (https://arxiv.org/abs/2411.10830) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); math.OC (Optimization and Control)
One-Layer Transformer Provably Learns One-Nearest Neighbor In Context
Authors: Zihao Li, Yuan Cao, Cheng Gao, Yihan He, Han Liu, Jason M. Klusowski, Jianqing Fan, Mengdi Wang
Abstract: Transformers have achieved great success in recent years. Interestingly, transformers have shown particularly strong in-context learning capability: even without fine-tuning, they are still able to solve unseen tasks well purely based on task-specific prompts. In this paper, we study the capability of one-layer transformers in learning one of the most classical nonparametric estimators, the one-nearest neighbor prediction rule. Under a theoretical framework where the prompt contains a sequence of labeled training data and unlabeled test data, we show that, although the loss function is nonconvex when trained with gradient descent, a single softmax attention layer can successfully learn to behave like a one-nearest neighbor classifier. Our result gives a concrete example of how transformers can be trained to implement nonparametric machine learning algorithms, and sheds light on the role of softmax attention in transformer models.
Submitted 16 November, 2024; originally announced November 2024.

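To see informally why softmax attention can mimic a one-nearest-neighbour rule (an illustration only; the paper's construction and training analysis are more involved), note that attention weights computed from negative squared distances at a large inverse temperature concentrate almost all mass on the closest labelled example:

    # Illustration: softmax attention over -||x_query - x_i||^2 approaches 1-NN as beta grows.
    import numpy as np

    def attention_predict(x_query, xs, ys, beta=50.0):
        # Weighted vote over in-context labels; large beta ~ one-nearest-neighbour prediction.
        scores = -beta * np.sum((xs - x_query) ** 2, axis=1)   # similarity = -scaled squared distance
        weights = np.exp(scores - scores.max())
        weights /= weights.sum()                               # softmax over context examples
        return float(weights @ ys)

    xs = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 0.0]])
    ys = np.array([-1.0, 1.0, -1.0])
    print(attention_predict(np.array([0.9, 1.1]), xs, ys))     # close to +1, the nearest label
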
arXiv:2411.10752 (https://arxiv.org/abs/2411.10752) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Towards a Comprehensive Benchmark for Pathological Lymph Node Metastasis in Breast Cancer Sections
Authors: Xitong Ling, Yuanyuan Lei, Jiawen Li, Junru Cheng, Wenting Huang, Tian Guan, Jian Guan, Yonghong He
Abstract: Advances in optical microscopy scanning have significantly contributed to computational pathology (CPath) by converting traditional histopathological slides into whole slide images (WSIs). This development enables comprehensive digital reviews by pathologists and accelerates AI-driven diagnostic support for WSI analysis. Recent advances in foundational pathology models have increased the need for benchmarking tasks. The Camelyon series is one of the most widely used open-source datasets in computational pathology. However, the quality, accessibility, and clinical relevance of its labels have not been comprehensively evaluated. In this study, we reprocessed 1,399 WSIs and labels from the Camelyon-16 and Camelyon-17 datasets, removing low-quality slides, correcting erroneous labels, and providing expert pixel annotations for tumor regions in the previously unreleased test set. Based on the sizes of the re-annotated tumor regions, we upgraded the binary cancer screening task to a four-class task: negative, micro-metastasis, macro-metastasis, and isolated tumor cells (ITC). We reevaluated pre-trained pathology feature extractors and multiple instance learning (MIL) methods using the cleaned dataset, providing a benchmark that advances AI development in histopathology.
Submitted 16 November, 2024; originally announced November 2024.

arXiv:2411.10720 (https://arxiv.org/abs/2411.10720) [pdf, other]
Subjects: cs.LG (Machine Learning); q-bio.NC (Neurons and Cognition); q-bio.QM (Quantitative Methods)
Multi Scale Graph Neural Network for Alzheimer's Disease
Authors: Anya Chauhan, Ayush Noori, Zhaozhi Li, Yingnan He, Michelle M Li, Marinka Zitnik, Sudeshna Das
Abstract: Alzheimer's disease (AD) is a complex, progressive neurodegenerative disorder characterized by extracellular amyloid-beta (Aβ) plaques, neurofibrillary tau tangles, glial activation, and neuronal degeneration, involving multiple cell types and pathways. Current models often overlook the cellular context of these pathways. To address this, we developed a multiscale graph neural network (GNN) model, ALZ PINNACLE, using brain omics data from donors spanning the entire aging-to-AD spectrum. ALZ PINNACLE is based on the PINNACLE GNN framework, which learns context-aware protein, cell type, and tissue representations within a unified latent space. ALZ PINNACLE was trained on 14,951 proteins, 206,850 protein interactions, 7 cell types, and 48 cell subtypes or states. After pretraining, we investigated the learned embedding of APOE, the largest genetic risk factor for AD, across different cell types. Notably, APOE embeddings showed high similarity in microglial, neuronal, and CD8 cells, suggesting a similar role for APOE in these cell types. Fine-tuning the model on AD risk genes revealed cell type contexts predictive of the role of APOE in AD. Our results suggest that ALZ PINNACLE may provide a valuable framework for uncovering novel insights into AD neurobiology.
Submitted 16 November, 2024; originally announced November 2024.
Comments: Findings paper presented at Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024, Vancouver, Canada, 9 pages

arXiv:2411.10709 (https://arxiv.org/abs/2411.10709) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Diagnostic Text-guided Representation Learning in Hierarchical Classification for Pathological Whole Slide Image
Authors: Jiawen Li, Qiehe Sun, Renao Yan, Yizhi Wang, Yuqiu Fu, Yani Wei, Tian Guan, Huijuan Shi, Yonghonghe He, Anjia Han
Abstract: With the development of digital imaging in medical microscopy, artificial intelligence-based analysis of pathological whole slide images (WSIs) provides a powerful tool for cancer diagnosis. Limited by the expensive cost of pixel-level annotation, current research primarily focuses on representation learning with slide-level labels, showing success in various downstream tasks. However, given the diversity of lesion types and the complex relationships between them, these techniques still deserve further exploration for advanced pathology tasks. To this end, we introduce the concept of hierarchical pathological image classification and propose a representation learning method called PathTree. PathTree treats the multi-classification of diseases as a binary tree structure. Each category is represented by a professional pathological text description, and these descriptions exchange information through a tree-like encoder. The interactive text features are then used to guide the aggregation of hierarchical multiple representations. PathTree uses slide-text similarity to obtain probability scores and introduces two extra tree-specific losses to further constrain the association between texts and slides. Through extensive experiments on three challenging hierarchical classification datasets (in-house cryosectioned lung tissue lesion identification, public prostate cancer grade assessment, and public breast cancer subtyping), our proposed PathTree is consistently competitive with state-of-the-art methods and provides a new perspective on deep learning-assisted solutions for more complex WSI classification.
Submitted 16 November, 2024; originally announced November 2024.
Comments: 15 pages, 13 figures. Under Review

arXiv:2411.10281 (https://arxiv.org/abs/2411.10281) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Multidimensional Byte Pair Encoding: Shortened Sequences for Improved Visual Data Generation
Authors: Tim Elsner, Paula Usinger, Julius Nehring-Wirxel, Gregor Kobsik, Victor Czech, Yanjiang He, Isaak Lim, Leif Kobbelt
Abstract: In language processing, transformers benefit greatly from text being condensed. This is achieved through a larger vocabulary that captures word fragments instead of plain characters, often via Byte Pair Encoding. In the context of images, tokenisation of visual data is usually limited to regular grids obtained from quantisation methods, without global content awareness. Our work improves tokenisation of visual data by bringing Byte Pair Encoding from 1D to multiple dimensions, as a complementary add-on to existing compression. We achieve this by counting constellations of token pairs and replacing the most frequent token pair with a newly introduced token. The multidimensionality only increases the computation time by a factor of 2 for images, making it applicable even to large datasets like ImageNet within minutes on consumer hardware. This is a lossless preprocessing step. Our evaluation shows improved training and inference performance of transformers on visual data, achieved by compressing frequent constellations of tokens: the resulting sequences are shorter, with more uniformly distributed information content, e.g., condensing empty regions in an image into single tokens. As our experiments show, these condensed sequences are easier to process. We additionally introduce a strategy to amplify this compression further by clustering the vocabulary.
Submitted 15 November, 2024; originally announced November 2024.

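The merge step the abstract describes, counting adjacent token pairs on a grid and replacing the most frequent pair with a new token, can be sketched as follows. This is an illustrative simplification (a single greedy merge over horizontal and vertical neighbours, with no overlap bookkeeping and no vocabulary clustering), not the authors' implementation.

    # Illustrative single merge step of a 2D Byte Pair Encoding; not the paper's code.
    from collections import Counter
    import numpy as np

    def bpe2d_merge_step(grid: np.ndarray, next_token: int):
        # Count horizontally and vertically adjacent token pairs.
        counts = Counter()
        h, w = grid.shape
        for y in range(h):
            for x in range(w):
                if x + 1 < w:
                    counts[(grid[y, x], grid[y, x + 1], 'h')] += 1
                if y + 1 < h:
                    counts[(grid[y, x], grid[y + 1, x], 'v')] += 1
        (a, b, direction), _ = counts.most_common(1)[0]
        merged = grid.astype(object).copy()
        # Greedily replace occurrences of the most frequent pair with the new token id.
        for y in range(h):
            for x in range(w):
                if direction == 'h' and x + 1 < w and merged[y, x] == a and merged[y, x + 1] == b:
                    merged[y, x], merged[y, x + 1] = next_token, None   # None marks an absorbed cell
                elif direction == 'v' and y + 1 < h and merged[y, x] == a and merged[y + 1, x] == b:
                    merged[y, x], merged[y + 1, x] = next_token, None
        return merged, (a, b, direction)

Repeating such merges with fresh token ids, then reading out the non-absorbed cells, is what shortens the sequence fed to the transformer.
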
arXiv:2411.09924 (https://arxiv.org/abs/2411.09924) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
A Polarization Image Dehazing Method Based on the Principle of Physical Diffusion
Authors: Zhenjun Zhang, Lijun Tang, Hongjin Wang, Lilian Zhang, Yunze He, Yaonan Wang
Abstract: Computer vision is increasingly used in areas such as unmanned vehicles, surveillance systems and remote sensing. However, in foggy scenarios, image degradation leads to loss of target details, which seriously affects the accuracy and effectiveness of these vision tasks. Polarized light, because its electromagnetic waves vibrate in a specific direction, resists scattering and refraction effects in complex media more effectively than unpolarized light. As a result, polarized light has a greater ability to maintain its polarization characteristics in complex transmission media and under long-distance imaging conditions. This property makes polarized imaging especially suitable for complex scenes such as outdoor and underwater environments, particularly in fog, where higher-quality images can be obtained. Based on this advantage, we propose an innovative semi-physical polarization dehazing method that does not rely on an external light source. The method simulates the diffusion process of fog and designs a diffusion kernel that corresponds to the image blurriness caused by this diffusion. By employing spatiotemporal Fourier transforms and deconvolution operations, the method recovers the state of fog droplets prior to diffusion and the light inversion distribution of objects. This approach effectively achieves dehazing and detail enhancement of the scene.
Submitted 14 November, 2024; originally announced November 2024.

arXiv:2411.09618 (https://arxiv.org/abs/2411.09618) [pdf, other]
Subjects: physics.med-ph (Medical Physics); cs.LG (Machine Learning)
DOI: 10.59275/j.melba.2024-9c68 (https://doi.org/10.59275/j.melba.2024-9c68)
MICCAI-CDMRI 2023 QuantConn Challenge Findings on Achieving Robust Quantitative Connectivity through Harmonized Preprocessing of Diffusion MRI
Authors: Nancy R. Newlin, Kurt Schilling, Serge Koudoro, Bramsh Qamar Chandio, Praitayini Kanakaraj, Daniel Moyer, Claire E. Kelly, Sila Genc, Jian Chen, Joseph Yuan-Mou Yang, Ye Wu, Yifei He, Jiawei Zhang, Qingrun Zeng, Fan Zhang, Nagesh Adluru, Vishwesh Nath, Sudhir Pathak, Walter Schneider, Anurag Gade, Yogesh Rathi, Tom Hendriks, Anna Vilanova, Maxime Chamberland, Tomasz Pieciak, et al. (11 additional authors not shown)
Abstract: White matter alterations are increasingly implicated in neurological diseases and their progression. International-scale studies use diffusion-weighted magnetic resonance imaging (DW-MRI) to qualitatively identify changes in white matter microstructure and connectivity. Yet, quantitative analysis of DW-MRI data is hindered by inconsistencies stemming from varying acquisition protocols. There is a pressing need to harmonize the preprocessing of DW-MRI datasets to ensure the derivation of robust quantitative diffusion metrics across acquisitions. In the MICCAI-CDMRI 2023 QuantConn challenge, participants were provided raw data from the same individuals collected on the same scanner but with two different acquisitions, and were tasked with preprocessing the DW-MRI to minimize acquisition differences while retaining biological variation. Submissions were evaluated on the reproducibility and comparability of cross-acquisition bundle-wise microstructure measures, bundle shape features, and connectomics. The key innovations of the QuantConn challenge are that (1) we assess bundles and tractography in the context of harmonization for the first time, (2) we assess connectomics in the context of harmonization for the first time, and (3) we have 10x more subjects than the prior harmonization challenge, MUSHAC, and 100x more than SuperMUDI. We find that bundle surface area, fractional anisotropy, connectome assortativity, betweenness centrality, edge count, modularity, nodal strength, and participation coefficient measures are most biased by acquisition, and that machine learning voxel-wise correction, RISH mapping, and NeSH methods effectively reduce these biases. In addition, the microstructure measures AD, MD, RD, bundle length, connectome density, efficiency, and path length are least biased by these acquisition differences.
Submitted 14 November, 2024; originally announced November 2024.
Comments: Accepted for publication at the Journal of Machine Learning for Biomedical Imaging (MELBA) https://melba-journal.org/2024/019
Journal ref: Machine Learning for Biomedical Imaging 2 (2024)

This study integrates 3D Gaussian Splatting (3DGS) with the Segment Anyth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08453v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08453v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08453v1-abstract-full" style="display: none;"> Biomass estimation of oilseed rape is crucial for optimizing crop productivity and breeding strategies. While UAV-based imaging has advanced high-throughput phenotyping, current methods often rely on orthophoto images, which struggle with overlapping leaves and incomplete structural information in complex field environments. This study integrates 3D Gaussian Splatting (3DGS) with the Segment Anything Model (SAM) for precise 3D reconstruction and biomass estimation of oilseed rape. UAV multi-view oblique images from 36 angles were used to perform 3D reconstruction, with the SAM module enhancing point cloud segmentation. The segmented point clouds were then converted into point cloud volumes, which were fitted to ground-measured biomass using linear regression. The results showed that 3DGS (7k and 30k iterations) provided high accuracy, with peak signal-to-noise ratios (PSNR) of 27.43 and 29.53 and training times of 7 and 49 minutes, respectively. This performance exceeded that of structure from motion (SfM) and mipmap Neural Radiance Fields (Mip-NeRF), demonstrating superior efficiency. The SAM module achieved high segmentation accuracy, with a mean intersection over union (mIoU) of 0.961 and an F1-score of 0.980. Additionally, a comparison of biomass extraction models found the point cloud volume model to be the most accurate, with an determination coefficient (R2) of 0.976, root mean square error (RMSE) of 2.92 g/plant, and mean absolute percentage error (MAPE) of 6.81%, outperforming both the plot crop volume and individual crop volume models. This study highlights the potential of combining 3DGS with multi-view UAV imaging for improved biomass phenotyping. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08453v1-abstract-full').style.display = 'none'; document.getElementById('2411.08453v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
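The volume-to-biomass step described in this abstract is ordinary least-squares regression followed by the reported goodness-of-fit metrics (R², RMSE, MAPE). A minimal sketch of that step, using invented toy numbers rather than the paper's data or code, might look like:

```python
import numpy as np

# Toy stand-in data: per-plant point-cloud volumes and ground-measured biomass.
# These values are illustrative only, not taken from the paper.
volume = np.array([120.0, 95.0, 150.0, 80.0, 132.0, 110.0])   # cm^3
biomass = np.array([34.0, 27.5, 43.0, 22.0, 38.5, 31.0])      # g/plant

# Fit biomass ~ a * volume + b by least squares.
a, b = np.polyfit(volume, biomass, deg=1)
pred = a * volume + b

# Goodness-of-fit metrics of the kind reported in the abstract.
ss_res = np.sum((biomass - pred) ** 2)
ss_tot = np.sum((biomass - biomass.mean()) ** 2)
r2 = 1.0 - ss_res / ss_tot
rmse = np.sqrt(np.mean((biomass - pred) ** 2))
mape = np.mean(np.abs((biomass - pred) / biomass)) * 100.0
print(f"R^2={r2:.3f}  RMSE={rmse:.2f} g/plant  MAPE={mape:.2f}%")
```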
arXiv:2411.07618 (https://arxiv.org/abs/2411.07618)
Categories: cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Direct Preference Optimization Using Sparse Feature-Level Constraints
Authors: Qingyu Yin, Chak Tou Leong, Hongbo Zhang, Minjun Zhu, Hanqi Yan, Qiang Zhang, Yulan He, Wenjie Li, Jun Wang, Yue Zhang, Linyi Yang
Abstract: The alignment of large language models (LLMs) with human preferences remains a key challenge. While post-training techniques like Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have achieved notable success, they often introduce computational inefficiencies and training instability. In this paper, we propose Feature-level constrained Preference Optimization (FPO), a novel method designed to simplify the alignment process while ensuring stability. FPO leverages pre-trained Sparse Autoencoders (SAEs) and introduces feature-level constraints, allowing for efficient, sparsity-enforced alignment. Our approach achieves efficiency by using sparse features activated in a well-trained sparse autoencoder, and preserves the quality of sequential KL divergence by using a feature-level offline reference. Experimental results on benchmark datasets demonstrate that FPO achieves a 5.08% absolute improvement in win rate at much lower computational cost than state-of-the-art baselines, making it a promising solution for efficient and controllable LLM alignment.
Submitted 12 November, 2024; originally announced November 2024.
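The abstract describes a DPO-style preference objective combined with a feature-level constraint computed from sparse-autoencoder (SAE) activations. The following is only a schematic reading of that idea, assuming PyTorch, an MSE penalty toward precomputed offline reference features, and toy tensor shapes; it is not the authors' implementation and the penalty form is an assumption.

```python
import torch
import torch.nn.functional as F

def fpo_style_loss(logp_policy_chosen, logp_policy_rejected,
                   logp_ref_chosen, logp_ref_rejected,
                   sae_feats_policy, sae_feats_ref,
                   beta=0.1, lam=0.01):
    """Illustrative sketch: DPO-style preference loss plus a feature-level
    constraint keeping sparse-autoencoder activations near an offline reference."""
    # Preference margin on (chosen - rejected) log-probability differences.
    margin = (logp_policy_chosen - logp_policy_rejected) \
             - (logp_ref_chosen - logp_ref_rejected)
    pref_loss = -F.logsigmoid(beta * margin).mean()
    # Feature-level constraint toward precomputed (offline) reference features.
    feat_penalty = F.mse_loss(sae_feats_policy, sae_feats_ref)
    return pref_loss + lam * feat_penalty

# Toy shapes: a batch of 4 preference pairs and 512 SAE features per response.
loss = fpo_style_loss(torch.randn(4), torch.randn(4),
                      torch.randn(4), torch.randn(4),
                      torch.rand(4, 512), torch.rand(4, 512))
```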
arXiv:2411.07569 (https://arxiv.org/abs/2411.07569)
Categories: cs.IR (Information Retrieval)
Title: Towards Automated Model Design on Recommender Systems
Authors: Tunhou Zhang, Dehua Cheng, Yuchen He, Zhengxing Chen, Xiaoliang Dai, Liang Xiong, Yudong Liu, Feng Cheng, Yufan Cao, Feng Yan, Hai Li, Yiran Chen, Wei Wen
Abstract: The increasing popularity of deep learning models has created new opportunities for developing AI-based recommender systems. Designing recommender systems using deep neural networks requires careful architecture design, and further optimization demands extensive co-design efforts on jointly optimizing model architecture and hardware. Design automation, such as Automated Machine Learning (AutoML), is necessary to fully exploit the potential of recommender model design, including model choices and model-hardware co-design strategies. We introduce a novel paradigm that utilizes weight sharing to explore abundant solution spaces. Our paradigm creates a large supernet to search for optimal architectures and co-design strategies to address the challenges of data multi-modality and heterogeneity in the recommendation domain. From a model perspective, the supernet includes a variety of operators, dense connectivity, and dimension search options. From a co-design perspective, it encompasses versatile Processing-In-Memory (PIM) configurations to produce hardware-efficient models. The scale, heterogeneity, and complexity of our solution space pose several challenges, which we address by proposing various techniques for training and evaluating the supernet. Our crafted models show promising results on three Click-Through Rate (CTR) prediction benchmarks, outperforming both manually designed and AutoML-crafted models with state-of-the-art performance when focusing solely on architecture search. From a co-design perspective, we achieve 2x FLOPs efficiency, 1.8x energy efficiency, and 1.5x performance improvements in recommender models.
Submitted 12 November, 2024; originally announced November 2024.
Comments: Accepted in ACM Transactions on Recommender Systems. arXiv admin note: substantial text overlap with arXiv:2207.07187
Journal ref: ACM Transactions on Recommender Systems (TORS) 2024

arXiv:2411.07515 (https://arxiv.org/abs/2411.07515)
Categories: cs.LG (Machine Learning)
Title: Bayesian Deep Learning Approach for Real-time Lane-based Arrival Curve Reconstruction at Intersection using License Plate Recognition Data
Authors: Yang He, Chengchuan An, Jiawei Lu, Yao-Jan Wu, Zhenbo Lu, Jingxin Xia
Abstract: The acquisition of real-time and accurate traffic arrival information is of vital importance for proactive traffic control systems, especially in partially connected vehicle environments. License plate recognition (LPR) data that record both vehicle departures and identities have proven desirable for reconstructing lane-based arrival curves in previous works. Existing LPR data-based methods are predominantly designed for reconstructing historical arrival curves. For real-time reconstruction on multi-lane urban roads, it is pivotal to determine the lane choice of real-time link-based arrivals, which has not been exploited in previous studies. In this study, we propose a Bayesian deep learning approach for real-time lane-based arrival curve reconstruction, in which both the lane choice patterns and the uncertainties of link-based arrivals are characterized. Specifically, the learning process is designed to effectively capture the relationship between partially observed link-based arrivals and lane-based arrivals, which can be physically interpreted as lane choice proportion. Moreover, the lane choice uncertainties are characterized using Bayesian parameter inference techniques, minimizing arrival curve reconstruction uncertainties, especially under low LPR data matching rates. Real-world experiments conducted in multiple matching rate scenarios demonstrate the superiority and necessity of lane choice modeling in reconstructing arrival curves.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Accepted by T-ITS
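The "lane choice proportion with uncertainty" idea can be illustrated in a far simpler setting than the paper's Bayesian deep learning model: a conjugate Dirichlet-multinomial update over matched LPR observations. The three-lane layout and all counts below are invented for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical matched LPR observations: which lane each observed link arrival
# chose during one interval, on a three-lane approach.
lane_counts = np.array([42, 25, 13])
prior = np.ones(3)                       # flat Dirichlet prior

# Conjugate posterior over lane-choice proportions.
posterior = prior + lane_counts
samples = rng.dirichlet(posterior, size=5000)

mean_props = samples.mean(axis=0)
ci_low, ci_high = np.percentile(samples, [2.5, 97.5], axis=0)
for lane in range(3):
    print(f"lane {lane + 1}: proportion {mean_props[lane]:.2f} "
          f"(95% CI {ci_low[lane]:.2f}-{ci_high[lane]:.2f})")

# Splitting a link-based arrival count by sampled proportions propagates the
# lane-choice uncertainty into lane-based arrival curves.
link_arrivals = 120
lane_arrival_samples = samples * link_arrivals
```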
arXiv:2411.07489 (https://arxiv.org/abs/2411.07489)
Categories: physics.med-ph (Medical Physics); physics.ins-det (Instrumentation and Detectors)
Title: An Exploration of Parallel Imaging System for Very-low Field (50mT) MRI Scanner
Authors: Lei Yang, Wei He, Sheng Shen, Yucheng He, Jiamin Wu, Zheng Xu
Abstract: Reducing the scanning time of very-low field (VLF) magnetic resonance imaging (MRI) scanners, commonly employed for stroke diagnosis, can enhance patient comfort and operational efficiency. The conventional parallel imaging (PI) technique for high-field MRI must be tailored to apply here, considering the differences in the direction of the main magnetic field and the presence of noise. A VLF-specific PI algorithm and phased-array coil are proposed, marking the first application of PI in VLF MRI. Reconstruction quality is enhanced by denoising undersampled k-space data using a linear-prediction based Kalman filter. Subsequently, the denoised k-space data are nonlinearly mapped from the original space onto a high-dimensional feature space, using a nonlinear frame defined by a polynomial feature mapping. Frame parameters are calculated using auto-calibration signals (ACS) from the center of k-space, and missing phase-encoding lines in the original space are estimated from the acquired lines in the feature space. An 8-channel phased-array coil, designed for a vertical main magnetic field, is decoupled using geometric overlap and a low input impedance (LII) preamplifier. Healthy volunteer head imaging experiments using the proposed PI technique exhibit the lowest mean-squared-error (MSE) value and the highest peak signal-to-noise ratio (PSNR) and structural similarity index (SSIM) values compared with two widely used PI methods. The proposed PI technique enables the VLF MRI scanner to achieve image quality similar to fully sampled images and a 72.5% improvement in signal-to-noise ratio (SNR) while requiring less than 50% of the scan time. We present a PI technique tailored to the VLF MRI scanner for the first time, along with potential research directions to achieve a greater reduction factor.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Submitted to IEEE Transactions on Instrumentation and Measurement
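The calibrate-on-ACS, then estimate-missing-lines idea can be illustrated with a deliberately simplified toy example: a single coil, random synthetic k-space, and linear interpolation weights fitted on the ACS block. The paper's actual method additionally uses Kalman-filter denoising and a nonlinear polynomial feature mapping, both omitted here.

```python
import numpy as np

rng = np.random.default_rng(1)

# Synthetic single-coil k-space (phase-encode x frequency-encode), toy data only.
n_pe, n_fe = 32, 64
kspace = rng.standard_normal((n_pe, n_fe)) + 1j * rng.standard_normal((n_pe, n_fe))

acs = kspace[12:20]          # central, fully sampled auto-calibration block

# Calibration: model each ACS line as w_up * line_above + w_down * line_below.
targets = acs[1:-1].reshape(-1)
sources = np.stack([acs[:-2].reshape(-1),      # neighbour above
                    acs[2:].reshape(-1)],      # neighbour below
                   axis=1)
weights, *_ = np.linalg.lstsq(sources, targets, rcond=None)

# Reconstruction: fill every other (skipped) phase-encoding line from its
# acquired neighbours using the calibrated weights.
recon = kspace.copy()
for row in range(1, n_pe - 1, 2):
    recon[row] = weights[0] * kspace[row - 1] + weights[1] * kspace[row + 1]
```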
arXiv:2411.07140 (https://arxiv.org/abs/2411.07140)
Categories: cs.CL (Computation and Language)
Title: Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language Models
Authors: Yancheng He, Shilong Li, Jiaheng Liu, Yingshui Tan, Weixun Wang, Hui Huang, Xingyuan Bu, Hangyu Guo, Chengwei Hu, Boren Zheng, Zhuoran Lin, Xuepeng Liu, Dekai Sun, Shirong Lin, Zhicheng Zheng, Xiaoyong Zhu, Wenbo Su, Bo Zheng
Abstract: New LLM evaluation benchmarks are important to keep pace with the rapid development of Large Language Models (LLMs). In this work, we present Chinese SimpleQA, the first comprehensive Chinese benchmark for evaluating the factuality of language models when answering short questions. Chinese SimpleQA has five main properties (i.e., Chinese, Diverse, High-quality, Static, Easy-to-evaluate). Specifically, first, we focus on the Chinese language across 6 major topics with 99 diverse subtopics. Second, we conduct a comprehensive quality control process to achieve high-quality questions and answers, where the reference answers are static and cannot be changed over time. Third, following SimpleQA, the questions and answers are very short, and the grading process is easy to evaluate using the OpenAI API. Based on Chinese SimpleQA, we perform a comprehensive evaluation of the factuality abilities of existing LLMs. Finally, we hope that Chinese SimpleQA can guide developers to better understand the Chinese factuality abilities of their models and facilitate the growth of foundation models.
Submitted 13 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.

arXiv:2411.07076 (https://arxiv.org/abs/2411.07076)
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: StoryTeller: Improving Long Video Description through Global Audio-Visual Character Identification
Authors: Yichen He, Yuan Lin, Jianchao Wu, Hanchong Zhang, Yuchen Zhang, Ruicheng Le
Abstract: Existing large vision-language models (LVLMs) are largely limited to processing short, seconds-long videos and struggle with generating coherent descriptions for extended videos spanning minutes or more. Long video description introduces new challenges, such as plot-level consistency across descriptions. To address these, we identify audio-visual character identification, i.e., matching character names to each line of dialogue, as a key factor. We propose StoryTeller, a system for generating dense descriptions of long videos, incorporating both low-level visual concepts and high-level plot information. StoryTeller uses a multimodal large language model that integrates visual, audio, and text modalities to perform audio-visual character identification on minute-long video clips. The results are then fed into an LVLM to enhance the consistency of the video descriptions. We validate our approach on movie description tasks and introduce MovieStory101, a dataset with dense descriptions for three-minute movie clips. To evaluate long video descriptions, we create MovieQA, a large set of multiple-choice questions for the MovieStory101 test set. We assess descriptions by inputting them into GPT-4 to answer these questions, using accuracy as an automatic evaluation metric. Experiments show that StoryTeller outperforms all open and closed-source baselines on MovieQA, achieving 9.5% higher accuracy than the strongest baseline, Gemini-1.5-pro, and demonstrating a +15.56% advantage in human side-by-side evaluations. Additionally, incorporating audio-visual character identification from StoryTeller improves the performance of all video description models, with Gemini-1.5-pro and GPT-4o showing relative improvements of 5.5% and 13.0%, respectively, in accuracy on MovieQA.
Submitted 11 November, 2024; originally announced November 2024.
arXiv:2411.06423 (https://arxiv.org/abs/2411.06423)
Categories: math.ST (Statistics Theory)
Title: Generalized Principal Component Analysis for Large-dimensional Matrix Factor Model
Authors: Yong He, Yujie Hou, Haixia Liu, Yalin Wang
Abstract: Matrix factor models have become popular dimension-reduction tools for large-dimensional matrix time series. However, the heteroscedasticity of the idiosyncratic components has barely received any attention. Starting from the pseudo-likelihood function, this paper introduces a Generalized Principal Component Analysis (GPCA) method for the matrix factor model that takes the heteroscedasticity into account. Theoretically, we first derive the asymptotic distribution of the GPCA estimators by assuming the separable covariance matrices are known in advance. We then propose adaptive thresholding estimators for the separable covariance matrices and show that this does not alter the asymptotic distribution of the GPCA estimators under certain regular sparsity conditions in the high-dimensional covariance matrix estimation literature. The GPCA estimators are shown to be more efficient than the state-of-the-art methods under certain heteroscedasticity conditions. Thorough numerical studies are conducted to demonstrate the superiority of our method over existing approaches. Analysis of a financial portfolio dataset illustrates the empirical usefulness of the proposed method.
Submitted 10 November, 2024; originally announced November 2024.
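For orientation, the large-dimensional matrix factor model that this line of work builds on is conventionally written as follows (standard notation, not quoted from the abstract):

$$X_t = R F_t C^\top + E_t, \qquad t = 1, \dots, T,$$

where $X_t$ is the $p_1 \times p_2$ observation at time $t$, $R$ ($p_1 \times k_1$) and $C$ ($p_2 \times k_2$) are the row and column loading matrices, $F_t$ is the $k_1 \times k_2$ latent factor matrix, and $E_t$ is the idiosyncratic error whose heteroscedasticity the proposed GPCA method is designed to handle.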
arXiv:2411.05878 (https://arxiv.org/abs/2411.05878)
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Joint-Optimized Unsupervised Adversarial Domain Adaptation in Remote Sensing Segmentation with Prompted Foundation Model
Authors: Shuchang Lyu, Qi Zhao, Guangliang Cheng, Yiwei He, Zheng Zhou, Guangbiao Wang, Zhenwei Shi
Abstract: Unsupervised Domain Adaptation for Remote Sensing Semantic Segmentation (UDA-RSSeg) addresses the challenge of adapting a model trained on source domain data to target domain samples, thereby minimizing the need for annotated data across diverse remote sensing scenes. This task presents two principal challenges: (1) severe inconsistencies in feature representation across different remote sensing domains, and (2) a domain gap that emerges due to the representation bias of source domain patterns when translating features into predictive logits. To tackle these issues, we propose a joint-optimized adversarial network incorporating the Segment Anything Model (SAM), termed SAM-JOANet, for UDA-RSSeg. Our approach integrates SAM to leverage its robust generalized representation capabilities, thereby alleviating feature inconsistencies. We introduce a finetuning decoder designed to convert SAM-encoder features into predictive logits. Additionally, a feature-level adversarial-based prompted segmentor is employed to generate class-agnostic maps, which guide the finetuning decoder's feature representations. The network is optimized end-to-end, combining the prompted segmentor and the finetuning decoder. Extensive evaluations on benchmark datasets, including ISPRS (Potsdam/Vaihingen) and CITY-OSM (Paris/Chicago), demonstrate the effectiveness of our method. The results, supported by visualization and analysis, confirm the method's interpretability and robustness. The code of this paper is available at https://github.com/CV-ShuchangLyu/SAM-JOANet.
Submitted 18 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
Comments: 12 pages, 6 figures, 6 tables

arXiv:2411.05842 (https://arxiv.org/abs/2411.05842)
Categories: eess.SY (Systems and Control); cs.LG (Machine Learning)
Title: Efficient and Robust Freeway Traffic Speed Estimation under Oblique Grid using Vehicle Trajectory Data
Authors: Yang He, Chengchuan An, Yuheng Jia, Jiachao Liu, Zhenbo Lu, Jingxin Xia
Abstract: Accurately estimating spatiotemporal traffic states on freeways is a significant challenge due to limited sensor deployment and potential data corruption. In this study, we propose an efficient and robust low-rank model for precise spatiotemporal traffic speed state estimation (TSE) using low-penetration vehicle trajectory data. Leveraging traffic wave priors, an oblique grid-based matrix is first designed to transform the inherent dependencies of spatiotemporal traffic states into the algebraic low-rankness of a matrix. Then, with the enhanced traffic-state low-rankness in the oblique matrix, a low-rank matrix completion method is tailored to explicitly capture spatiotemporal traffic propagation characteristics and precisely reconstruct traffic states. In addition, an anomaly-tolerant module based on a sparse matrix is developed to accommodate corrupted data input and thereby improve the robustness of the TSE model. Notably, driven by the understanding of traffic waves, the computational complexity of the proposed efficient method is correlated only with the problem size itself, not with the dataset size and hyperparameter selection prevalent in existing studies. Extensive experiments demonstrate the effectiveness, robustness, and efficiency of the proposed model. The proposed method achieves up to a 12% improvement in Root Mean Squared Error (RMSE) in the TSE scenarios and an 18% improvement in RMSE in the robust TSE scenarios, and it runs more than 20 times faster than state-of-the-art (SOTA) methods.
Submitted 6 November, 2024; originally announced November 2024.
Comments: Accepted by T-ITS
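Setting aside the oblique-grid transform and the anomaly-tolerant module, the core low-rank completion step can be sketched with a generic SVD soft-thresholding (soft-impute) loop on a synthetic space-by-time speed matrix. Everything below, including the threshold value, is illustrative and not the paper's algorithm.

```python
import numpy as np

rng = np.random.default_rng(2)

# Synthetic rank-1 "speed field" (30 road cells x 60 time steps), ~30% observed.
true_speeds = np.outer(rng.uniform(40, 70, size=30), rng.uniform(0.7, 1.0, size=60))
mask = rng.random(true_speeds.shape) < 0.3
observed = np.where(mask, true_speeds, 0.0)

def soft_impute(observed, mask, tau=50.0, n_iter=200):
    """Fill missing entries with a low-rank estimate via singular-value shrinkage."""
    X = observed.copy()
    for _ in range(n_iter):
        U, s, Vt = np.linalg.svd(X, full_matrices=False)
        X_low = (U * np.maximum(s - tau, 0.0)) @ Vt    # shrink singular values
        X = np.where(mask, observed, X_low)            # keep observed cells fixed
    return X

estimate = soft_impute(observed, mask)
rmse = np.sqrt(np.mean((estimate[~mask] - true_speeds[~mask]) ** 2))
print(f"held-out RMSE: {rmse:.2f} km/h")
```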
arXiv:2411.05738 (https://arxiv.org/abs/2411.05738)
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: StdGEN: Semantic-Decomposed 3D Character Generation from Single Images
Authors: Yuze He, Yanning Zhou, Wang Zhao, Zhongkai Wu, Kaiwen Xiao, Wei Yang, Yong-Jin Liu, Xiao Han
Abstract: We present StdGEN, an innovative pipeline for generating semantically decomposed, high-quality 3D characters from single images, enabling broad applications in virtual reality, gaming, and filmmaking. Unlike previous methods, which struggle with limited decomposability, unsatisfactory quality, and long optimization times, StdGEN features decomposability, effectiveness and efficiency; i.e., it generates intricately detailed 3D characters with separated semantic components such as the body, clothes, and hair in three minutes. At the core of StdGEN is our proposed Semantic-aware Large Reconstruction Model (S-LRM), a transformer-based generalizable model that jointly reconstructs geometry, color and semantics from multi-view images in a feed-forward manner. A differentiable multi-layer semantic surface extraction scheme is introduced to acquire meshes from hybrid implicit fields reconstructed by our S-LRM. Additionally, a specialized efficient multi-view diffusion model and an iterative multi-layer surface refinement module are integrated into the pipeline to facilitate high-quality, decomposable 3D character generation. Extensive experiments demonstrate our state-of-the-art performance in 3D anime character generation, surpassing existing baselines by a significant margin in geometry, texture and decomposability. StdGEN offers ready-to-use semantically decomposed 3D characters and enables flexible customization for a wide range of applications. Project page: https://stdgen.github.io
Submitted 8 November, 2024; originally announced November 2024.
Comments: 13 pages, 10 figures

arXiv:2411.05362 (https://arxiv.org/abs/2411.05362)
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: From Transparent to Opaque: Rethinking Neural Implicit Surfaces with $\alpha$-NeuS
Authors: Haoran Zhang, Junkai Deng, Xuhui Chen, Fei Hou, Wencheng Wang, Hong Qin, Chen Qian, Ying He
Abstract: Traditional 3D shape reconstruction techniques from multi-view images, such as structure from motion and multi-view stereo, primarily focus on opaque surfaces. Similarly, recent advances in neural radiance fields and their variants also primarily address opaque objects, encountering difficulties with the complex lighting effects caused by transparent materials. This paper introduces $\alpha$-NeuS, a new method for simultaneously reconstructing thin transparent objects and opaque objects based on neural implicit surfaces (NeuS). Our method leverages the observation that transparent surfaces induce local extreme values in the learned distance fields during neural volumetric rendering, in contrast with opaque surfaces that align with zero level sets. Traditional iso-surfacing algorithms such as marching cubes, which rely on fixed iso-values, are ill-suited for such data. We address this by taking the absolute value of the distance field and developing an optimization method that extracts level sets corresponding to both non-negative local minima and zero iso-values. We prove that the reconstructed surfaces are unbiased for both transparent and opaque objects. To validate our approach, we construct a benchmark that includes both real-world and synthetic scenes, demonstrating its practical utility and effectiveness. Our data and code are publicly available at https://github.com/728388808/alpha-NeuS.
Submitted 8 November, 2024; originally announced November 2024.
Journal ref: NeurIPS 2024

arXiv:2411.05332 (https://arxiv.org/abs/2411.05332)
Categories: math.OC (Optimization and Control)
Title: Sparse Principal Component Analysis with Non-Oblivious Adversarial Perturbations
Authors: Yuqing He, Guanyi Wang, Yu Yang
Abstract: Sparse Principal Component Analysis (sparse PCA) is a fundamental dimension-reduction tool that enhances interpretability in various high-dimensional settings. An important variant of sparse PCA studies the scenario when samples are adversarially perturbed. Notably, most existing statistical studies on this variant focus on recovering the ground truth and verifying the robustness of classical algorithms when the given samples are corrupted under oblivious adversarial perturbations. In contrast, this paper aims to find a robust sparse principal component that maximizes the variance of the given samples corrupted by non-oblivious adversarial perturbations, termed sparse PCA with Non-Oblivious Adversarial Perturbations (sparse PCA-NOAP). Specifically, we introduce a general formulation for the proposed sparse PCA-NOAP. We then derive Mixed-Integer Programming (MIP) reformulations to upper bound it with provable worst-case guarantees when adversarial perturbations are controlled by two typical norms, i.e., the $\ell_{2 \rightarrow \infty}$-norm (sample-wise $\ell_2$-norm perturbation) and the $\ell_{1 \rightarrow 2}$-norm (feature-wise $\ell_2$-norm perturbation). Moreover, when samples are drawn from the spiked Wishart model, we show that the proposed MIP reformulations ensure vector recovery properties under a more general parameter region compared with existing results. Numerical simulations are also provided to validate the theoretical findings and demonstrate the accuracy of the proposed formulations.
Submitted 8 November, 2024; originally announced November 2024.
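This paper's contribution is the MIP reformulation under non-oblivious perturbations. As a point of reference for the underlying (non-robust) sparse PCA problem itself, a classical truncated power iteration on a spiked covariance looks like the sketch below; this is a generic textbook-style method, not the approach proposed in the paper.

```python
import numpy as np

rng = np.random.default_rng(3)

def truncated_power_iteration(cov, k, n_iter=100):
    """Return a unit vector with at most k nonzeros approximately maximizing x' cov x."""
    x = rng.standard_normal(cov.shape[0])
    x /= np.linalg.norm(x)
    for _ in range(n_iter):
        y = cov @ x
        support = np.argsort(np.abs(y))[-k:]   # keep the k largest coordinates
        x = np.zeros_like(y)
        x[support] = y[support]
        x /= np.linalg.norm(x)
    return x

# Spiked covariance with a 5-sparse leading component (synthetic sanity check).
d, k = 50, 5
v = np.zeros(d)
v[:k] = 1.0 / np.sqrt(k)
samples = rng.multivariate_normal(np.zeros(d), 4.0 * np.outer(v, v) + np.eye(d), size=500)
cov = np.cov(samples, rowvar=False)
print("recovered support:", np.nonzero(truncated_power_iteration(cov, k))[0])
```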
arXiv:2411.04952 (https://arxiv.org/abs/2411.04952) [cs.CV, cs.AI, cs.CL]
Title: M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page Multi-document Understanding
Authors: Jaemin Cho, Debanjan Mahata, Ozan Irsoy, Yujie He, Mohit Bansal
Abstract: Document visual question answering (DocVQA) pipelines that answer questions from documents have broad applications. Existing methods focus on handling single-page documents with multi-modal language models (MLMs), or rely on text-based retrieval-augmented generation (RAG) that uses text extraction tools such as optical character recognition (OCR). However, there are difficulties in applying these methods in real-world scenarios: (a) questions often require information across different pages or documents, where MLMs cannot handle many long documents; (b) documents often have important information in visual elements such as figures, but text extraction tools ignore them. We introduce M3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various document contexts (closed-domain and open-domain), question hops (single-hop and multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG finds relevant documents and answers questions using a multi-modal retriever and an MLM, so that it can efficiently handle single or many documents while preserving visual information. Since previous DocVQA datasets ask questions in the context of a specific document, we also present M3DocVQA, a new benchmark for evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages. In three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results show that M3DocRAG with ColPali and Qwen2-VL 7B achieves superior performance than many strong baselines, including state-of-the-art performance in MP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and retrieval models. Lastly, we qualitatively show that M3DocRAG can successfully handle various scenarios, such as when relevant information exists across multiple pages and when answer evidence only exists in images.
Submitted 7 November, 2024; originally announced November 2024.
Comments: Project webpage: https://m3docrag.github.io
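To make the retrieve-then-read recipe above concrete, here is a minimal, self-contained Python sketch of a multi-modal RAG loop over page embeddings. It is an illustration only: the embedding and answering functions are stand-ins (the paper uses ColPali-style retrieval and an MLM such as Qwen2-VL; none of the names below come from the M3DocRAG codebase).

import numpy as np

def embed_page(page_image: np.ndarray) -> np.ndarray:
    """Stand-in for a visual retriever encoder (e.g., a ColPali-like model)."""
    # Trivial placeholder embedding so the sketch runs end to end.
    return page_image.reshape(-1)[:64].astype(np.float32)

def embed_query(question: str) -> np.ndarray:
    """Stand-in for the query encoder."""
    rng = np.random.default_rng(abs(hash(question)) % (2**32))
    return rng.standard_normal(64).astype(np.float32)

def answer_with_mlm(question: str, pages: list) -> str:
    """Stand-in for a multi-modal LM that reads the retrieved page images."""
    return f"[answer to {question!r} grounded in {len(pages)} retrieved pages]"

def multimodal_rag(question: str, page_images: list, top_k: int = 4) -> str:
    # 1) Index: embed every page image once.
    index = np.stack([embed_page(p) for p in page_images])
    index /= np.linalg.norm(index, axis=1, keepdims=True) + 1e-8
    # 2) Retrieve: cosine similarity between the query and page embeddings.
    q = embed_query(question)
    q /= np.linalg.norm(q) + 1e-8
    top = np.argsort(-(index @ q))[:top_k]
    # 3) Read: pass the retrieved page images to the multi-modal LM.
    return answer_with_mlm(question, [page_images[i] for i in top])

if __name__ == "__main__":
    pages = [np.random.rand(32, 32, 3) for _ in range(10)]   # fake page images
    print(multimodal_rag("What is reported in Table 2?", pages))

Because retrieval operates directly on page images, figures and charts remain visible to the reader model, which is the point the abstract makes against OCR-only pipelines.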
arXiv:2411.04919 (https://arxiv.org/abs/2411.04919) [cs.RO, cs.CV]
Title: Stem-OB: Generalizable Visual Imitation Learning with Stem-Like Convergent Observation through Diffusion Inversion
Authors: Kaizhe Hu, Zihang Rui, Yao He, Yuyao Liu, Pu Hua, Huazhe Xu
Abstract: Visual imitation learning methods demonstrate strong performance, yet they lack generalization when faced with visual input perturbations, including variations in lighting and textures, impeding their real-world application. We propose Stem-OB that utilizes pretrained image diffusion models to suppress low-level visual differences while maintaining high-level scene structures. This image inversion process is akin to transforming the observation into a shared representation, from which other observations stem, with extraneous details removed. Stem-OB contrasts with data-augmentation approaches as it is robust to various unspecified appearance changes without the need for additional training. Our method is a simple yet highly effective plug-and-play solution. Empirical results confirm the effectiveness of our approach in simulated tasks and show an exceptionally significant improvement in real-world applications, with an average increase of 22.2% in success rates compared to the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info.
Submitted 13 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
Comments: arXiv preprint version, website: https://hukz18.github.io/Stem-Ob/
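The "image inversion" idea above can be pictured with the standard DDIM inversion update, which deterministically pushes an image part-way toward the diffusion prior; running only a few steps washes out low-level appearance while keeping coarse structure. The sketch below shows the generic update with a stub noise predictor; it is not the Stem-OB implementation, and eps_model, the alpha_bar schedule, and the step count are illustrative assumptions.

import numpy as np

def ddim_invert(x0, eps_model, alpha_bar, n_steps):
    """Deterministic DDIM inversion: map a clean image x0 toward the noise
    prior for n_steps, using a noise-prediction model eps_model(x, t)."""
    x = x0
    for t in range(n_steps):
        a_t, a_next = alpha_bar[t], alpha_bar[t + 1]
        eps = eps_model(x, t)                       # predicted noise at step t
        x0_pred = (x - np.sqrt(1.0 - a_t) * eps) / np.sqrt(a_t)
        x = np.sqrt(a_next) * x0_pred + np.sqrt(1.0 - a_next) * eps
    return x  # partially inverted observation: low-level detail suppressed

if __name__ == "__main__":
    # Toy setup: a linear alpha-bar schedule and a dummy noise predictor.
    alpha_bar = np.linspace(1.0, 0.5, 11)           # alpha_bar[0] = 1 (clean)
    eps_model = lambda x, t: np.zeros_like(x)       # stand-in, NOT a real model
    obs = np.random.rand(64, 64, 3)
    print(ddim_invert(obs, eps_model, alpha_bar, n_steps=5).shape)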
arXiv:2411.03752 (https://arxiv.org/abs/2411.03752) [cs.LG, cs.CR, cs.CV]
Title: Deferred Poisoning: Making the Model More Vulnerable via Hessian Singularization
Authors: Yuhao He, Jinyu Tian, Xianwei Zheng, Li Dong, Yuanman Li, Leo Yu Zhang, Jiantao Zhou
Abstract: Recent studies have shown that deep learning models are very vulnerable to poisoning attacks. Many defense methods have been proposed to address this issue. However, traditional poisoning attacks are not as threatening as commonly believed. This is because they often cause differences in how the model performs on the training set compared to the validation set. Such inconsistency can alert defenders that their data has been poisoned, allowing them to take the necessary defensive actions. In this paper, we introduce a more threatening type of poisoning attack called the Deferred Poisoning Attack. This new attack allows the model to function normally during the training and validation phases but makes it very sensitive to evasion attacks or even natural noise. We achieve this by ensuring the poisoned model's loss function has a similar value as a normally trained model at each input sample but with a large local curvature. A similar model loss ensures that there is no obvious inconsistency between the training and validation accuracy, demonstrating high stealthiness. On the other hand, the large curvature implies that a small perturbation may cause a significant increase in model loss, leading to substantial performance degradation, which reflects a worse robustness. We fulfill this purpose by making the model have singular Hessian information at the optimal point via our proposed Singularization Regularization term. We have conducted both theoretical and empirical analyses of the proposed method and validated its effectiveness through experiments on image classification tasks. Furthermore, we have confirmed the hazards of this form of poisoning attack under more general scenarios using natural noise, offering a new perspective for research in the field of security.
Submitted 6 November, 2024; originally announced November 2024.
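The stealth-versus-sensitivity trade-off described above follows directly from a second-order expansion of the loss around an input $x$ (a generic observation, not the paper's derivation): with gradient $g=\nabla_x \ell(x)$ and Hessian $H=\nabla_x^2 \ell(x)$,

$$\ell(x+\delta)\;\approx\;\ell(x) + g^\top\delta + \tfrac{1}{2}\,\delta^\top H\,\delta,$$

so two models can agree on $\ell(x)$ (matching clean training and validation behavior) while a large or ill-conditioned $H$ makes $\ell(x+\delta)$ grow sharply even for small $\|\delta\|$. The paper's Singularization Regularization term is the mechanism it proposes for shaping $H$ in this way; its exact form is not reproduced here.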
arXiv:2411.01916 (https://arxiv.org/abs/2411.01916) [cs.CV]
Title: Masked Autoencoders are Parameter-Efficient Federated Continual Learners
Authors: Yuchen He, Xiangfeng Wang
Abstract: Federated learning is a specific distributed learning paradigm in which a central server aggregates updates from multiple clients' local models, thereby enabling the server to learn without requiring clients to upload their private data, maintaining data privacy. While existing federated learning methods are primarily designed for static data, real-world applications often require clients to learn new categories over time. This challenge necessitates the integration of continual learning techniques, leading to federated continual learning (FCL). To address both catastrophic forgetting and non-IID issues, we propose to use masked autoencoders (MAEs) as parameter-efficient federated continual learners, called pMAE. pMAE learns reconstructive prompt on the client side through image reconstruction using MAE. On the server side, it reconstructs the uploaded restore information to capture the data distribution across previous tasks and different clients, using these reconstructed images to fine-tune discriminative prompt and classifier parameters tailored for classification, thereby alleviating catastrophic forgetting and non-IID issues on a global scale. Experimental results demonstrate that pMAE achieves performance comparable to existing prompt-based methods and can enhance their effectiveness, particularly when using self-supervised pre-trained transformers as the backbone. Code is available at: https://github.com/ycheoo/pMAE.
Submitted 24 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
arXiv:2411.01904 (https://arxiv.org/abs/2411.01904) [cs.LG, cs.CV]
Title: FPPL: An Efficient and Non-IID Robust Federated Continual Learning Framework
Authors: Yuchen He, Chuyun Shen, Xiangfeng Wang, Bo Jin
Abstract: Federated continual learning (FCL) aims to learn from sequential data stream in the decentralized federated learning setting, while simultaneously mitigating the catastrophic forgetting issue in classical continual learning. Existing FCL methods usually employ typical rehearsal mechanisms, which could result in privacy violations or additional onerous storage and computational burdens. In this work, an efficient and non-IID robust federated continual learning framework, called Federated Prototype-Augmented Prompt Learning (FPPL), is proposed. The FPPL can collaboratively learn lightweight prompts augmented by prototypes without rehearsal. On the client side, a fusion function is employed to fully leverage the knowledge contained in task-specific prompts for alleviating catastrophic forgetting. Additionally, global prototypes aggregated from the server are used to obtain unified representation through contrastive learning, mitigating the impact of non-IID-derived data heterogeneity. On the server side, locally uploaded prototypes are utilized to perform debiasing on the classifier, further alleviating the performance degradation caused by both non-IID and catastrophic forgetting. Empirical evaluations demonstrate the effectiveness of FPPL, achieving notable performance with an efficient design while remaining robust to diverse non-IID degrees. Code is available at: https://github.com/ycheoo/FPPL.
Submitted 16 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
arXiv:2411.01488 (https://arxiv.org/abs/2411.01488) [cs.GR]
Title: ITS: Implicit Thin Shell for Polygonal Meshes
Authors: Huibiao Wen, Lei Wang, Yunxiao Zhang, Shuangmin Chen, Shiqing Xin, Chongyang Deng, Ying He, Wenping Wang, Changhe Tu
Abstract: In computer graphics, simplifying a polygonal mesh surface $\mathcal{M}$ into a geometric proxy that maintains close conformity to $\mathcal{M}$ is crucial, as it can significantly reduce computational demands in various applications. In this paper, we introduce the Implicit Thin Shell (ITS), a concept designed to implicitly represent the sandwich-walled space surrounding $\mathcal{M}$, defined as $\{\mathbf{x}\in\mathbb{R}^3 \mid \varepsilon_1\leq f(\mathbf{x}) \leq \varepsilon_2,\ \varepsilon_1 < 0,\ \varepsilon_2 > 0\}$. Here, $f$ is an approximation of the signed distance function (SDF) of $\mathcal{M}$, and we aim to minimize the thickness $\varepsilon_2-\varepsilon_1$. To achieve a balance between mathematical simplicity and expressive capability in $f$, we employ a tri-variate tensor-product B-spline to represent $f$. This representation is coupled with adaptive knot grids that adapt to the inherent shape variations of $\mathcal{M}$, while restricting $f$'s basis functions to the first degree. In this manner, the analytical form of $f$ can be rapidly determined by solving a sparse linear system. Moreover, the process of identifying the extreme values of $f$ among the infinitely many points on $\mathcal{M}$ can be simplified to seeking extremes among a finite set of candidate points. By exhausting the candidate points, we find the extreme values $\varepsilon_1<0$ and $\varepsilon_2>0$ that minimize the thickness. The constructed ITS is guaranteed to wrap $\mathcal{M}$ rigorously, without any intersections between the bounding surfaces and $\mathcal{M}$. ITS offers numerous potential applications thanks to its rigorousness, tightness, expressiveness, and computational efficiency. We demonstrate the efficacy of ITS in rapid inside-outside tests and in mesh simplification through the control of global error.
Submitted 3 November, 2024; originally announced November 2024.
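A compact way to read the thickness minimization above (our paraphrase, not a statement from the paper): once $f$ is fixed, the tightest shell that still contains $\mathcal{M}$ simply takes the extreme values of $f$ over the surface,

$$\varepsilon_1 \;=\; \min_{\mathbf{x}\in\mathcal{M}} f(\mathbf{x}), \qquad \varepsilon_2 \;=\; \max_{\mathbf{x}\in\mathcal{M}} f(\mathbf{x}),$$

with $\varepsilon_1 < 0 < \varepsilon_2$ as stated in the abstract. This is why reducing the search over the infinitely many surface points to a finite set of candidate points makes the construction fast.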
arXiv:2411.01460 (https://arxiv.org/abs/2411.01460) [cs.DC]
Title: Mao: Machine learning approach for NUMA optimization in Warehouse Scale Computers
Authors: Yueji Liu, Jun Jin, Wenhui Shu, Shiyong Li, Yongzhan He
Abstract: Non-Uniform Memory Access (NUMA) architecture imposes numerous performance challenges to today's cloud workloads. Due to the complexity and the massive scale of modern warehouse-scale computers (WSCs), a lot of efforts need to be done to improve the memory access locality on the NUMA architecture. In Baidu, we have found that NUMA optimization has significant performance benefit to the major workloads like Search and Feed (Baidu's recommendation system). But how to conduct NUMA optimization within the large-scale cluster brings a lot of subtle complexities and workload-specific scenario optimizations. In this paper, we present a production-deployed solution in Baidu called MAO (Memory Access Optimizer) that helps improve the memory access locality for Baidu's various workloads. MAO includes an online module and an offline module. The online module is responsible for online monitoring, dynamic NUMA node binding and runtime optimization. Meanwhile the offline workload characterization module proceeds with data analysis and resource-sensitivity model training. We also propose a new performance model called the "NUMA Sensitivity model" to address the impact of remote memory access on workload performance and to project the potential performance improvement via NUMA optimization for a specific workload. Based on continuous data collected from online monitoring, this model is proved to be working properly in MAO. As of today, we have successfully deployed MAO to more than one hundred thousand servers. In our Feed product, we have achieved 12.1% average latency improvements and 9.8% CPU resource saving.
Submitted 3 November, 2024; originally announced November 2024.
Comments: 10 pages, 13 figures
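The "dynamic NUMA node binding" step can be pictured with standard Linux tooling: the sketch below launches a worker whose CPUs and memory allocations are restricted to one NUMA node, so most accesses stay node-local. It only illustrates node binding with numactl (which must be installed) and is not MAO's actual policy; the node-selection logic here is a made-up placeholder.

import subprocess

def launch_numa_local(cmd: list, node: int):
    """Run cmd pinned to a single NUMA node for both CPU and memory,
    the basic lever that MAO-style systems pull to improve locality."""
    return subprocess.run(
        ["numactl", f"--cpunodebind={node}", f"--membind={node}", *cmd],
        check=True,
    )

if __name__ == "__main__":
    # Placeholder policy: a real system would pick the node from monitoring
    # data (e.g., remote-access ratios), not hard-code it.
    launch_numa_local(["sleep", "1"], node=0)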
arXiv:2411.01236 (https://arxiv.org/abs/2411.01236) [cs.CR, cs.AI]
Title: AutoPT: How Far Are We from the End2End Automated Web Penetration Testing?
Authors: Benlong Wu, Guoqiang Chen, Kejiang Chen, Xiuwei Shang, Jiapeng Han, Yanru He, Weiming Zhang, Nenghai Yu
Abstract: Penetration testing is essential to ensure Web security, which can detect and fix vulnerabilities in advance, and prevent data leakage and serious consequences. The powerful inference capabilities of large language models (LLMs) have made significant progress in various fields, and the development potential of LLM-based agents can revolutionize the cybersecurity penetration testing industry. In this work, we establish a comprehensive end-to-end penetration testing benchmark using a real-world penetration testing environment to explore the capabilities of LLM-based agents in this domain. Our results reveal that the agents are familiar with the framework of penetration testing tasks, but they still face limitations in generating accurate commands and executing complete processes. Accordingly, we summarize the current challenges, including the difficulty of maintaining the entire message history and the tendency for the agent to become stuck. Based on the above insights, we propose a Penetration testing State Machine (PSM) that utilizes the Finite State Machine (FSM) methodology to address these limitations. Then, we introduce AutoPT, an automated penetration testing agent based on the principle of PSM driven by LLMs, which utilizes the inherent inference ability of LLM and the constraint framework of state machines. Our evaluation results show that AutoPT outperforms the baseline framework ReAct on the GPT-4o mini model and improves the task completion rate from 22% to 41% on the benchmark target. Compared with the baseline framework and manual work, AutoPT also reduces time and economic costs further. Hence, our AutoPT has facilitated the development of automated penetration testing and significantly impacted both academia and industry.
Submitted 2 November, 2024; originally announced November 2024.
Comments: 22 pages, 6 figures
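The PSM idea of constraining an LLM-driven agent with a finite state machine, so it cannot wander or stall, can be sketched as below. The states, the transition rule, and the ask_llm stub are hypothetical illustrations, not the states or prompts used by AutoPT.

from enum import Enum, auto

class State(Enum):
    RECON = auto()
    SCAN = auto()
    EXPLOIT = auto()
    REPORT = auto()
    DONE = auto()

# Allowed transitions: the state machine, not the LLM, decides what comes next.
TRANSITIONS = {
    State.RECON: State.SCAN,
    State.SCAN: State.EXPLOIT,
    State.EXPLOIT: State.REPORT,
    State.REPORT: State.DONE,
}

def ask_llm(state: State, context: str) -> str:
    """Stand-in for an LLM call; a real agent would prompt the model with only
    the instructions and history relevant to the current state."""
    return f"<command proposed for {state.name} given: {context[:40]}>"

def run_agent(target: str, max_steps: int = 10) -> list:
    state, context, log = State.RECON, f"target={target}", []
    for _ in range(max_steps):
        if state is State.DONE:
            break
        action = ask_llm(state, context)      # LLM fills in the step's content
        log.append((state.name, action))
        context += f" | {state.name}:{action}"
        state = TRANSITIONS[state]            # FSM enforces the overall process
    return log

if __name__ == "__main__":
    for step in run_agent("http://testsite.local"):
        print(step)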
arXiv:2411.01215 (https://arxiv.org/abs/2411.01215) [astro-ph.HE]
Title: Detection of two TeV gamma-ray outbursts from NGC 1275 by LHAASO
Authors: Zhen Cao, F. Aharonian, Axikegu, Y. X. Bai, Y. W. Bao, D. Bastieri, X. J. Bi, Y. J. Bi, J. T. Cai, Q. Cao, W. Y. Cao, Zhe Cao, J. Chang, J. F. Chang, A. M. Chen, E. S. Chen, Liang Chen, Lin Chen, Long Chen, M. J. Chen, M. L. Chen, Q. H. Chen, S. H. Chen, S. Z. Chen, T. L. Chen, et al. (254 additional authors not shown)
Abstract: The Water Cherenkov Detector Array (WCDA) is one of the components of Large High Altitude Air Shower Observatory (LHAASO) and can monitor any sources over two-thirds of the sky for up to 7 hours per day with >98% duty cycle. In this work, we report the detection of two outbursts of the Fanaroff-Riley I radio galaxy NGC 1275 that were detected by LHAASO-WCDA between November 2022 and January 2023 with statistical significance of 5.2$\sigma$ and 8.3$\sigma$. The observed spectral energy distribution in the range from 500 GeV to 3 TeV is fitted by a power-law with a best-fit spectral index of $\alpha=-3.37\pm0.52$ and $-3.35\pm0.29$, respectively. The outburst flux above 0.5 TeV was $(4.55\pm 4.21)\times 10^{-11}~\rm cm^{-2}~s^{-1}$ and $(3.45\pm 1.78)\times 10^{-11}~\rm cm^{-2}~s^{-1}$, corresponding to 60% and 45% of the Crab Nebula flux. Variation analysis reveals a variability time-scale of days at the TeV energy band. A simple test by a one-zone synchrotron self-Compton model reproduces the data in the gamma-ray band well.
Submitted 5 November, 2024; v1 submitted 2 November, 2024; originally announced November 2024.
Comments: 11 pages, 8 figures, 3 tables
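To connect the fitted spectral index to the quoted integral fluxes, recall the standard relation for a power-law photon spectrum $dN/dE = N_0\,(E/E_0)^{\alpha}$ with $\alpha < -1$:

$$F(>E_{\min}) \;=\; \int_{E_{\min}}^{\infty} N_0\left(\frac{E}{E_0}\right)^{\alpha} dE \;=\; \frac{N_0\,E_0}{-(\alpha+1)}\left(\frac{E_{\min}}{E_0}\right)^{\alpha+1}.$$

With $\alpha\approx-3.4$ and $E_{\min}=0.5$ TeV the integral converges quickly and is dominated by photons just above threshold. The values $(4.55\pm4.21)\times10^{-11}$ and $(3.45\pm1.78)\times10^{-11}\,\mathrm{cm^{-2}\,s^{-1}}$ are the abstract's numbers; the normalization $N_0$ and pivot energy $E_0$ are not given there.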
arXiv:2411.00845 (https://arxiv.org/abs/2411.00845) [cs.LG, cs.AI, cs.CY]
Title: End-to-end Graph Learning Approach for Cognitive Diagnosis of Student Tutorial
Authors: Fulai Yang, Di Wu, Yi He, Li Tao, Xin Luo
Abstract: Cognitive diagnosis (CD) utilizes students' existing studying records to estimate their mastery of unknown knowledge concepts, which is vital for evaluating their learning abilities. Accurate CD is extremely challenging because CD is associated with complex relationships and mechanisms among students, knowledge concepts, studying records, etc. However, existing approaches loosely consider these relationships and mechanisms by a non-end-to-end learning framework, resulting in sub-optimal feature extractions and fusions for CD. Different from them, this paper innovatively proposes an End-to-end Graph Neural Networks-based Cognitive Diagnosis (EGNN-CD) model. EGNN-CD consists of three main parts: knowledge concept network (KCN), graph neural networks-based feature extraction (GNNFE), and cognitive ability prediction (CAP). First, KCN constructs CD-related interaction by comprehensively extracting physical information from students, exercises, and knowledge concepts. Second, a four-channel GNNFE is designed to extract high-order and individual features from the constructed KCN. Finally, CAP employs a multi-layer perceptron to fuse the extracted features to predict students' learning abilities in an end-to-end learning way. With such designs, the feature extractions and fusions are guaranteed to be comprehensive and optimal for CD. Extensive experiments on three real datasets demonstrate that our EGNN-CD achieves significantly higher accuracy than state-of-the-art models in CD.
Submitted 30 October, 2024; originally announced November 2024.
arXiv:2410.24218 (https://arxiv.org/abs/2410.24218) [cs.CL, cs.AI, cs.CV, cs.LG, cs.RO]
Title: Teaching Embodied Reinforcement Learning Agents: Informativeness and Diversity of Language Use
Authors: Jiajun Xi, Yinong He, Jianing Yang, Yinpei Dai, Joyce Chai
Abstract: In real-world scenarios, it is desirable for embodied agents to have the ability to leverage human language to gain explicit or implicit knowledge for learning tasks. Despite recent progress, most previous approaches adopt simple low-level instructions as language inputs, which may not reflect natural human communication. It's not clear how to incorporate rich language use to facilitate task learning. To address this question, this paper studies different types of language inputs in facilitating reinforcement learning (RL) embodied agents. More specifically, we examine how different levels of language informativeness (i.e., feedback on past behaviors and future guidance) and diversity (i.e., variation of language expressions) impact agent learning and inference. Our empirical results based on four RL benchmarks demonstrate that agents trained with diverse and informative language feedback can achieve enhanced generalization and fast adaptation to new tasks. These findings highlight the pivotal role of language use in teaching embodied agents new tasks in an open world. Project website: https://github.com/sled-group/Teachable_RL
Submitted 31 October, 2024; originally announced October 2024.
Comments: EMNLP 2024 Main. Project website: https://github.com/sled-group/Teachable_RL

arXiv:2410.23610 (https://arxiv.org/abs/2410.23610) [stat.ML, cs.LG, math.ST]
Title: Global Convergence in Training Large-Scale Transformers
Authors: Cheng Gao, Yuan Cao, Zihao Li, Yihan He, Mengdi Wang, Han Liu, Jason Matthew Klusowski, Jianqing Fan
Abstract: Despite the widespread success of Transformers across various domains, their optimization guarantees in large-scale model settings are not well-understood. This paper rigorously analyzes the convergence properties of gradient flow in training Transformers with weight decay regularization. First, we construct the mean-field limit of large-scale Transformers, showing that as the model width and depth go to infinity, gradient flow converges to the Wasserstein gradient flow, which is represented by a partial differential equation. Then, we demonstrate that the gradient flow reaches a global minimum consistent with the PDE solution when the weight decay regularization parameter is sufficiently small. Our analysis is based on a series of novel mean-field techniques that adapt to Transformers. Compared with existing tools for deep networks (Lu et al., 2020) that demand homogeneity and global Lipschitz smoothness, we utilize a refined analysis assuming only $\textit{partial homogeneity}$ and $\textit{local Lipschitz smoothness}$. These new techniques may be of independent interest.
Submitted 30 October, 2024; originally announced October 2024.
Comments: to be published in 38th Conference on Neural Information Processing Systems (NeurIPS 2024)
MSC Class: 35Q93
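For context on the "Wasserstein gradient flow" mentioned above: the generic form of such a flow for a parameter distribution $\rho_t$ and an energy functional $F(\rho)$ is the continuity-type PDE

$$\partial_t \rho_t \;=\; \nabla_\theta \cdot \Big(\rho_t\, \nabla_\theta\, \frac{\delta F}{\delta \rho}(\rho_t)\Big),$$

i.e., probability mass flows along the negative first variation of $F$. The paper derives the specific PDE arising as the mean-field limit of Transformer training, which is not reproduced here.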
arXiv:2410.22105 (https://arxiv.org/abs/2410.22105) [cs.DB, cs.AI]
Title: DAGE: DAG Query Answering via Relational Combinator with Logical Constraints
Authors: Yunjie He, Bo Xiong, Daniel Hernández, Yuqicheng Zhu, Evgeny Kharlamov, Steffen Staab
Abstract: Predicting answers to queries over knowledge graphs is called a complex reasoning task because answering a query requires subdividing it into subqueries. Existing query embedding methods use this decomposition to compute the embedding of a query as the combination of the embedding of the subqueries. This requirement limits the answerable queries to queries having a single free variable and being decomposable, which are called tree-form queries and correspond to the $\mathcal{SROI}^-$ description logic. In this paper, we define a more general set of queries, called DAG queries and formulated in the $\mathcal{ALCOIR}$ description logic, propose a query embedding method for them, called DAGE, and a new benchmark to evaluate query embeddings on them. Given the computational graph of a DAG query, DAGE combines the possibly multiple paths between two nodes into a single path with a trainable operator that represents the intersection of relations and learns DAG-DL from tautologies. We show that it is possible to implement DAGE on top of existing query embedding methods, and we empirically measure the improvement of our method over the results of vanilla methods evaluated in tree-form queries that approximate the DAG queries of our proposed benchmark.
Submitted 29 October, 2024; originally announced October 2024.
