Search | arXiv e-print repository

Showing 1–50 of 2,561 results for author: Zhao, Y

Searching in archive cs. Search in all archives.

arXiv:2411.14405  [pdf, other]  cs.CL
Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions
Authors: Yu Zhao, Huifeng Yin, Bo Zeng, Hao Wang, Tianqi Shi, Chenyang Lyu, Longyue Wang, Weihua Luo, Kaifu Zhang
Abstract: Currently OpenAI o1 has sparked a surge of interest in the study of large reasoning models (LRM). Building on this momentum, Marco-o1 not only focuses on disciplines with standard answers, such as mathematics, physics, and coding -- which are well-suited for reinforcement learning (RL) -- but also places greater emphasis on open-ended resolutions. We aim to address the question: "Can the o1 model effectively generalize to broader domains where clear standards are absent and rewards are challenging to quantify?" Marco-o1 is powered by Chain-of-Thought (CoT) fine-tuning, Monte Carlo Tree Search (MCTS), reflection mechanisms, and innovative reasoning strategies -- optimized for complex real-world problem-solving tasks.
Submitted 21 November, 2024; originally announced November 2024.

arXiv:2411.13578  [pdf, other]  cs.CV cs.AI cs.LG
COOD: Concept-based Zero-shot OOD Detection
Authors: Zhendong Liu, Yi Nian, Henry Peng Zou, Li Li, Xiyang Hu, Yue Zhao
Abstract: How can models effectively detect out-of-distribution (OOD) samples in complex, multi-label settings without extensive retraining? Existing OOD detection methods struggle to capture the intricate semantic relationships and label co-occurrences inherent in multi-label settings, often requiring large amounts of training data and failing to generalize to unseen label combinations. While large language models have revolutionized zero-shot OOD detection, they primarily focus on single-label scenarios, leaving a critical gap in handling real-world tasks where samples can be associated with multiple interdependent labels. To address these challenges, we introduce COOD, a novel zero-shot multi-label OOD detection framework. COOD leverages pre-trained vision-language models, enhancing them with a concept-based label expansion strategy and a new scoring function. By enriching the semantic space with both positive and negative concepts for each label, our approach models complex label dependencies, precisely differentiating OOD samples without the need for additional training. Extensive experiments demonstrate that our method significantly outperforms existing approaches, achieving approximately 95% average AUROC on both VOC and COCO datasets, while maintaining robust performance across varying numbers of labels and different types of OOD samples.
Submitted 15 November, 2024; originally announced November 2024.

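The COOD abstract mentions a concept-based label expansion and a new scoring function without spelling either out. Purely to make that idea concrete (not the paper's actual method), the sketch below scores each label by contrasting an image embedding against hypothetical positive and negative concept embeddings; all names, dimensions, and the score formula are assumptions.

```python
# Illustrative sketch only: COOD's scoring function is not given in the abstract.
# Assumes we already have an image embedding and, per label, lists of positive and
# negative concept embeddings (e.g., from a CLIP-style text encoder).
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def label_score(image_emb, pos_concepts, neg_concepts):
    # Contrast similarity to a label's positive concepts against its negative ones.
    pos = max(cosine(image_emb, c) for c in pos_concepts)
    neg = max(cosine(image_emb, c) for c in neg_concepts)
    return pos - neg

def ood_score(image_emb, concept_bank):
    # An image looks OOD when no in-distribution label explains it well.
    best = max(label_score(image_emb, pc, nc) for pc, nc in concept_bank.values())
    return -best  # larger value => more likely OOD

rng = np.random.default_rng(0)
dim = 512
concept_bank = {  # label -> (positive concepts, negative concepts), random stand-ins
    "dog": ([rng.normal(size=dim) for _ in range(3)], [rng.normal(size=dim) for _ in range(3)]),
    "car": ([rng.normal(size=dim) for _ in range(3)], [rng.normal(size=dim) for _ in range(3)]),
}
print(ood_score(rng.normal(size=dim), concept_bank))
```
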
arXiv:2411.13534  [pdf, other]  cs.CL
Predictive Insights into LGBTQ+ Minority Stress: A Transductive Exploration of Social Media Discourse
Authors: S. Chapagain, Y. Zhao, T. K. Rohleen, S. M. Hamdi, S. F. Boubrahimi, R. E. Flinn, E. M. Lund, D. Klooster, J. R. Scheer, C. J. Cascalheira
Abstract: Individuals who identify as sexual and gender minorities, including lesbian, gay, bisexual, transgender, queer, and others (LGBTQ+) are more likely to experience poorer health than their heterosexual and cisgender counterparts. One primary source that drives these health disparities is minority stress (i.e., chronic and social stressors unique to LGBTQ+ communities' experiences adapting to the dominant culture). This stress is frequently expressed in LGBTQ+ users' posts on social media platforms. However, these expressions are not just straightforward manifestations of minority stress. They involve linguistic complexity (e.g., idiom or lexical diversity), rendering them challenging for many traditional natural language processing methods to detect. In this work, we designed a hybrid model using Graph Neural Networks (GNN) and Bidirectional Encoder Representations from Transformers (BERT), a pre-trained deep language model, to improve the classification performance of minority stress detection. We experimented with our model on a benchmark social media dataset for minority stress detection (LGBTQ+ MiSSoM+). The dataset comprises 5,789 human-annotated Reddit posts from LGBTQ+ subreddits. Our approach enables the extraction of hidden linguistic nuances through pretraining on a vast amount of raw data, while also engaging in transductive learning to jointly develop representations for both labeled training data and unlabeled test data. The RoBERTa-GCN model achieved an accuracy of 0.86 and an F1 score of 0.86, surpassing the performance of other baseline models in predicting LGBTQ+ minority stress. Improved prediction of minority stress expressions on social media could lead to digital health interventions to improve the wellbeing of LGBTQ+ people, a community with high rates of stress-sensitive health problems.
Submitted 20 November, 2024; originally announced November 2024.
Comments: This paper is accepted in 2024 IEEE 11th International Conference on Data Science and Advanced Analytics (DSAA)

arXiv:2411.13001  [pdf, other]  cs.CV
Collaborative Feature-Logits Contrastive Learning for Open-Set Semi-Supervised Object Detection
Authors: Xinhao Zhong, Siyu Jiao, Yao Zhao, Yunchao Wei
Abstract: Current Semi-Supervised Object Detection (SSOD) methods enhance detector performance by leveraging large amounts of unlabeled data, assuming that both labeled and unlabeled data share the same label space. However, in open-set scenarios, the unlabeled dataset contains both in-distribution (ID) classes and out-of-distribution (OOD) classes. Applying semi-supervised detectors in such settings can lead to misclassifying OOD classes as ID classes. To alleviate this issue, we propose a simple yet effective method, termed Collaborative Feature-Logits Detector (CFL-Detector). Specifically, we introduce a feature-level clustering method using contrastive loss to clarify vector boundaries in the feature space and highlight class differences. Additionally, by optimizing the logits-level uncertainty classification loss, the model enhances its ability to effectively distinguish between ID and OOD classes. Extensive experiments demonstrate that our method achieves state-of-the-art performance compared to existing methods.
Submitted 19 November, 2024; originally announced November 2024.

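The CFL-Detector abstract names a feature-level contrastive loss but gives no formula. The following is a generic supervised contrastive loss over detection features, shown only to make the feature-level idea concrete; the temperature, shapes, and the logits-level uncertainty term (omitted here) are assumptions, not taken from the paper.

```python
# Minimal sketch of a supervised contrastive loss over (toy) ROI features.
import torch
import torch.nn.functional as F

def supervised_contrastive_loss(features, labels, temperature=0.1):
    """features: (N, D) vectors; labels: (N,) class ids."""
    features = F.normalize(features, dim=1)
    sim = features @ features.T / temperature                 # pairwise similarities
    mask_pos = (labels[:, None] == labels[None, :]).float()
    mask_pos.fill_diagonal_(0)                                 # exclude self-pairs
    logits_mask = torch.ones_like(sim).fill_diagonal_(0)
    # log-softmax over all other samples, averaged over positive pairs
    log_prob = sim - torch.logsumexp(sim + torch.log(logits_mask + 1e-12), dim=1, keepdim=True)
    denom = mask_pos.sum(dim=1).clamp(min=1)
    loss = -(mask_pos * log_prob).sum(dim=1) / denom
    return loss.mean()

feats = torch.randn(8, 128)
labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
print(supervised_contrastive_loss(feats, labels).item())
```
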
arXiv:2411.12454  [pdf, other]  cs.CR
StrTune: Data Dependence-based Code Slicing for Binary Similarity Detection with Fine-tuned Representation
Authors: Kaiyan He, Yikun Hu, Xuehui Li, Yunhao Song, Yubo Zhao, Dawu Gu
Abstract: Binary Code Similarity Detection (BCSD) is significant for software security as it can address binary tasks such as malicious code snippets identification and binary patch analysis by comparing code patterns. Recently, there has been a growing focus on artificial intelligence-based approaches in BCSD due to their scalability and generalization. Because binaries are compiled with different compilation configurations, existing approaches still face notable limitations when comparing binary similarity. First, BCSD requires analysis on code behavior, and existing work claims to extract semantics, but actually still makes analysis in terms of syntax. Second, directly extracting features from assembly sequences, existing work cannot address the issues of instruction reordering and different syntax expressions caused by various compilation configurations. In this paper, we propose StrTune, which slices binary code based on data dependence and performs slice-level fine-tuning. To address the first limitation, StrTune performs backward slicing based on data dependence to capture how a value is computed along the execution. Each slice reflects the collecting semantics of the code, which is stable across different compilation configurations. StrTune introduces flow types to emphasize the independence of computations between slices, forming a graph representation. To overcome the second limitation, based on slices corresponding to the same value computation but having different syntax representations, StrTune utilizes a Siamese Network to fine-tune such pairs, making their representations closer in the feature space.
Submitted 19 November, 2024; originally announced November 2024.

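The StrTune abstract states that slices computing the same value but differing in syntax are fine-tuned as pairs with a Siamese network. Below is a minimal sketch of that training step with a stand-in bag-of-tokens encoder; the paper's actual slice encoder, tokenization, and loss are not reproduced, and all names here are hypothetical.

```python
# Sketch of Siamese-style fine-tuning on slice pairs (same-value pairs pulled together).
import torch
import torch.nn as nn

class SliceEncoder(nn.Module):
    def __init__(self, vocab=1000, dim=128):
        super().__init__()
        self.emb = nn.EmbeddingBag(vocab, dim)   # bag-of-tokens stand-in encoder
        self.proj = nn.Linear(dim, dim)

    def forward(self, token_ids):
        return self.proj(self.emb(token_ids))

encoder = SliceEncoder()
loss_fn = nn.CosineEmbeddingLoss(margin=0.2)
opt = torch.optim.Adam(encoder.parameters(), lr=1e-3)

# Toy batch: (slice_a, slice_b, label) with label=+1 for same-value slices, -1 otherwise.
a = torch.randint(0, 1000, (4, 16))
b = torch.randint(0, 1000, (4, 16))
y = torch.tensor([1.0, 1.0, -1.0, -1.0])

za, zb = encoder(a), encoder(b)   # shared weights = Siamese setup
loss = loss_fn(za, zb, y)
loss.backward()
opt.step()
print(float(loss))
```
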
arXiv:2411.12426  [pdf, other]  cs.CV
Motif Channel Opened in a White-Box: Stereo Matching via Motif Correlation Graph
Authors: Ziyang Chen, Yongjun Zhang, Wenting Li, Bingshu Wang, Yong Zhao, C. L. Philip Chen
Abstract: Real-world applications of stereo matching, such as autonomous driving, place stringent demands on both safety and accuracy. However, learning-based stereo matching methods inherently suffer from the loss of geometric structures in certain feature channels, creating a bottleneck in achieving precise detail matching. Additionally, these methods lack interpretability due to the black-box nature of deep learning. In this paper, we propose MoCha-V2, a novel learning-based paradigm for stereo matching. MoCha-V2 introduces the Motif Correlation Graph (MCG) to capture recurring textures, which are referred to as "motifs" within feature channels. These motifs reconstruct geometric structures and are learned in a more interpretable way. Subsequently, we integrate features from multiple frequency domains through wavelet inverse transformation. The resulting motif features are utilized to restore geometric structures in the stereo matching process. Experimental results demonstrate the effectiveness of MoCha-V2. MoCha-V2 achieved 1st place on the Middlebury benchmark at the time of its release. Code is available at https://github.com/ZYangChen/MoCha-Stereo.
Submitted 19 November, 2024; originally announced November 2024.

arXiv:2411.11659  [pdf, ps, other]  cs.SE
Improving Data Curation of Software Vulnerability Patches through Uncertainty Quantification
Authors: Hui Chen, Yunhua Zhao, Kostadin Damevski
Abstract: The changesets (or patches) that fix open source software vulnerabilities form critical datasets for various machine learning security-enhancing applications, such as automated vulnerability patching and silent fix detection. These patch datasets are derived from extensive collections of historical vulnerability fixes, maintained in databases like the Common Vulnerabilities and Exposures list and the National Vulnerability Database. However, since these databases focus on rapid notification to the security community, they contain significant inaccuracies and omissions that have a negative impact on downstream software security quality assurance tasks. In this paper, we propose an approach employing Uncertainty Quantification (UQ) to curate datasets of publicly-available software vulnerability patches. Our methodology leverages machine learning models that incorporate UQ to differentiate between patches based on their potential utility. We begin by evaluating a number of popular UQ techniques, including Vanilla, Monte Carlo Dropout, and Model Ensemble, as well as homoscedastic and heteroscedastic models of noise. Our findings indicate that Model Ensemble and heteroscedastic models are the best choices for vulnerability patch datasets. Based on these UQ modeling choices, we propose a heuristic that uses UQ to filter out lower quality instances and select instances with high utility value from the vulnerability dataset. Using our approach, we observe an improvement in predictive performance and significant reduction of model training time (i.e., energy consumption) for a state-of-the-art vulnerability prediction model.
Submitted 18 November, 2024; originally announced November 2024.

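The abstract proposes a UQ-based heuristic for filtering low-quality patch instances without giving its exact form. The sketch below shows the general shape of ensemble-based curation, ranking instances by a confidence-minus-disagreement proxy; the utility proxy, keep fraction, and data are invented purely for illustration.

```python
# Illustrative ensemble-uncertainty filtering; not the paper's heuristic.
import numpy as np

def curate(ensemble_probs, keep_fraction=0.7):
    """ensemble_probs: (n_models, n_instances) predicted P(correctly linked patch)."""
    mean_p = ensemble_probs.mean(axis=0)
    epistemic = ensemble_probs.var(axis=0)        # disagreement across the ensemble
    utility = mean_p - epistemic                  # simple proxy: confident and positive ranks highest
    order = np.argsort(-utility)
    keep = order[: int(keep_fraction * ensemble_probs.shape[1])]
    return np.sort(keep)

rng = np.random.default_rng(1)
probs = np.clip(rng.normal(0.7, 0.15, size=(5, 10)), 0, 1)   # 5 models, 10 candidate patches
print(curate(probs))
```
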
arXiv:2411.11532  [pdf, other]  cs.SE cs.CR
A Code Knowledge Graph-Enhanced System for LLM-Based Fuzz Driver Generation
Authors: Hanxiang Xu, Wei Ma, Ting Zhou, Yanjie Zhao, Kai Chen, Qiang Hu, Yang Liu, Haoyu Wang
Abstract: The rapid development of large language models (LLMs) with advanced programming capabilities has paved the way for innovative approaches in software testing. Fuzz testing, a cornerstone for improving software reliability and detecting vulnerabilities, often relies on manually written fuzz drivers, limiting scalability and efficiency. To address this challenge, we propose CodeGraphGPT, a novel system that integrates code knowledge graphs with an LLM-powered intelligent agent to automate the fuzz driver generation process. By framing fuzz driver creation as a code generation task, CodeGraphGPT leverages program analysis to construct a knowledge graph of code repositories, where nodes represent code entities, such as functions or files, and edges capture their relationships. This enables the system to generate tailored fuzz drivers and input seeds, resolve compilation errors, and analyze crash reports, all while adapting to specific API usage scenarios. Additionally, querying the knowledge graph helps identify precise testing targets and contextualize the purpose of each fuzz driver within the fuzzing loop. We evaluated CodeGraphGPT on eight open-source software projects, achieving an average improvement of 8.73% in code coverage compared to state-of-the-art methods. Moreover, it reduced the manual workload in crash case analysis by 84.4% and identified 11 real-world bugs, including nine previously unreported ones. This work highlights how integrating LLMs with code knowledge graphs enhances fuzz driver generation, offering an efficient solution for vulnerability detection and software quality improvement.
Submitted 18 November, 2024; originally announced November 2024.
Comments: 12 pages, 3 figures

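The abstract describes a knowledge graph whose nodes are code entities (functions, files) and whose edges capture their relationships, queried to assemble context for fuzz-driver generation. Below is a toy version of such a graph and a context query, assuming networkx and made-up libpng-style entity names; this is not CodeGraphGPT's actual schema or pipeline.

```python
# Toy code knowledge graph: files/functions as nodes, "contains"/"calls" as edges.
import networkx as nx

g = nx.DiGraph()
g.add_node("png.c", kind="file")
g.add_node("png_read_info", kind="function")
g.add_node("png_create_read_struct", kind="function")
g.add_edge("png.c", "png_read_info", rel="contains")
g.add_edge("png.c", "png_create_read_struct", rel="contains")
g.add_edge("png_read_info", "png_create_read_struct", rel="calls")

def context_for(target):
    """Collect related entities one might feed to an LLM prompt for a fuzz driver."""
    callees = [v for _, v, d in g.out_edges(target, data=True) if d["rel"] == "calls"]
    files = [u for u, _, d in g.in_edges(target, data=True) if d["rel"] == "contains"]
    return {"target": target, "defined_in": files, "calls": callees}

print(context_for("png_read_info"))
```
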
arXiv:2411.11479  [pdf, other]  cs.CL
Quantifying Preferences of Vision-Language Models via Value Decomposition in Social Media Contexts
Authors: Jingxuan Li, Yuning Yang, Shengqi Yang, Yizhou Zhao, Ying Nian Wu
Abstract: The rapid advancement of Vision-Language Models (VLMs) has expanded multimodal applications, yet evaluations often focus on basic tasks like object recognition, overlooking abstract aspects such as personalities and values. To address this gap, we introduce Value-Spectrum, a visual question-answering benchmark aimed at assessing VLMs based on Schwartz's value dimensions, which capture core values guiding people's beliefs and actions across cultures. We constructed a vectorized database of over 50,000 short videos sourced from TikTok, YouTube Shorts, and Instagram Reels, covering multiple months and a wide array of topics such as family, health, hobbies, society, and technology. We also developed a VLM agent pipeline to automate video browsing and analysis. Benchmarking representative VLMs on Value-Spectrum reveals significant differences in their responses to value-oriented content, with most models exhibiting a preference for hedonistic topics. Beyond identifying natural preferences, we explored the ability of VLM agents to adopt specific personas when explicitly prompted, revealing insights into the models' adaptability in role-playing scenarios. These findings highlight the potential of Value-Spectrum as a comprehensive evaluation set for tracking VLM advancements in value-based tasks and for developing more sophisticated role-playing AI agents.
Submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.08728  [pdf, other]  cs.AI
Polymetis: Large Language Modeling for Multiple Material Domains
Authors: Chao Huang, Huichen Xiao, Chen Chen, Chunyan Chen, Yi Zhao, Shiyu Du, Yiming Zhang, He Sha, Ruixin Gu
Abstract: As the application of large language models in various fields continues to expand, materials science also ushers in opportunities for AI-driven innovation. The traditional reliance on manual search for materials science-related information is giving way to artificial intelligence as an auxiliary tool that improves the efficiency of materials science research. To accelerate researchers' knowledge acquisition and intelligent decision-making support in materials science research, this paper proposes Polymetis, a large language model for a variety of materials fields, aiming to provide highly professional knowledge answers in the field of materials, covering energy materials, functional materials, alloy materials, physical chemistry, biology, and other material directions. The model uses a dataset of about 2 million material knowledge instructions, and in the process of building the dataset, we developed the Intelligent Extraction Large Model (IELM), which is specially used to extract and form structured knowledge from scientific texts, avoiding a large amount of costly manual annotation and improving efficiency. We inject this data into the GLM4-9B model for learning to enhance its inference capabilities in a variety of material domains. In addition, we have introduced enhanced prompt strategies to ensure that the answers of the model are more organized and comprehensive, providing efficient and comprehensive intelligent support for the diverse needs of materials science exploration, and promoting the development of material science.
Submitted 13 November, 2024; originally announced November 2024.

arXiv:2411.08448  [pdf, other]  cs.DC  doi: 10.1145/3652892.3700757
In Serverless, OS Scheduler Choice Costs Money: A Hybrid Scheduling Approach for Cheaper FaaS
Authors: Yuxuan Zhao, Weikang Weng, Rob van Nieuwpoort, Alexandru Uta
Abstract: In Function-as-a-Service (FaaS) serverless, large applications are split into short-lived stateless functions. Deploying functions is mutually profitable: users need not be concerned with resource management, while providers can keep their servers at high utilization rates running thousands of functions concurrently on a single machine. It is exactly this high concurrency that comes at a cost. The standard Linux Completely Fair Scheduler (CFS) switches often between tasks, which leads to prolonged execution times. We present evidence that relying on the default Linux CFS scheduler increases serverless workload cost by up to 10X. In this article, we raise awareness and make a case for rethinking the OS-level scheduling in Linux for serverless workloads composed of many short-lived processes. To make serverless more affordable, we introduce a hybrid two-level scheduling approach that relies on FaaS characteristics. Short-running functions are executed in FIFO fashion without preemption, while longer-running functions are passed to CFS after a certain time period. We show that tailor-made OS scheduling is able to significantly reduce user-facing costs without adding any provider-facing overhead.
Submitted 13 November, 2024; originally announced November 2024.
Comments: Accepted at Middleware 2024, author draft made available for timely dissemination

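The hybrid two-level policy in the abstract is concrete enough to simulate: run functions FIFO without preemption, and hand anything still running after a threshold to a fair-share scheduler. The toy simulation below illustrates that policy only; the threshold, quantum, and the round-robin stand-in for CFS are assumptions, not the paper's implementation.

```python
# Toy simulation of a two-level FIFO + fair-share scheduling policy.
from collections import deque

def hybrid_schedule(runtimes, promotion_threshold=0.05, quantum=0.01):
    fifo = deque(enumerate(runtimes))   # (function id, remaining runtime in seconds)
    fair = deque()
    finish_order = []

    # Level 1: FIFO without preemption, but capped at the promotion threshold.
    while fifo:
        fid, remaining = fifo.popleft()
        if remaining <= promotion_threshold:
            finish_order.append(fid)
        else:
            fair.append((fid, remaining - promotion_threshold))

    # Level 2: round-robin stands in for CFS for the long-running leftovers.
    while fair:
        fid, remaining = fair.popleft()
        if remaining <= quantum:
            finish_order.append(fid)
        else:
            fair.append((fid, remaining - quantum))
    return finish_order

# Short functions complete immediately in arrival order; long ones finish later.
print(hybrid_schedule([0.02, 0.30, 0.01, 0.12]))
```
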
arXiv:2411.08284  [pdf, other]  cs.IT math.NA
Dynamic Thresholding Algorithm with Memory for Linear Inverse Problems
Authors: Zhong-Feng Sun, Yun-Bin Zhao, Jin-Chuan Zhou, Zheng-Hai Huang
Abstract: The relaxed optimal $k$-thresholding pursuit (ROTP) is a recent algorithm for linear inverse problems. This algorithm is based on the optimal $k$-thresholding technique which performs vector thresholding and error metric reduction simultaneously. Although ROTP can be used to solve small to medium-sized linear inverse problems, the computational cost of this algorithm is high when solving large-scale problems. By merging the optimal $k$-thresholding technique and iterative method with memory as well as optimization with sparse search directions, we propose the so-called dynamic thresholding algorithm with memory (DTAM), which iteratively and dynamically selects vector bases to construct the problem solution. At every step, the algorithm uses more than one or all iterates generated so far to construct a new search direction, and solves only the small-sized quadratic subproblems at every iteration. Thus the computational complexity of DTAM is remarkably lower than that of ROTP-type methods. It turns out that DTAM can locate the solution of linear inverse problems if the matrix involved satisfies the restricted isometry property. Experiments on synthetic data, audio signal reconstruction and image denoising demonstrate that the proposed algorithm performs comparably to several mainstream thresholding and greedy algorithms, and it works much faster than the ROTP-type algorithms especially when the sparsity level of signal is relatively low.
Submitted 12 November, 2024; originally announced November 2024.

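DTAM itself is not specified in the abstract beyond its ingredients (optimal $k$-thresholding, a search direction built from past iterates, small quadratic subproblems), so the sketch below shows a simpler, generic relative instead: iterative hard thresholding with a heavy-ball "memory" term for y = Ax. It is meant only to illustrate the thresholding-plus-memory flavour, not to reproduce DTAM; the step size and memory weight are arbitrary.

```python
# Generic iterative hard thresholding with a memory (heavy-ball) term; not DTAM.
import numpy as np

def hard_threshold(v, k):
    out = np.zeros_like(v)
    idx = np.argsort(np.abs(v))[-k:]   # keep the k largest-magnitude entries
    out[idx] = v[idx]
    return out

def iht_with_memory(A, y, k, steps=200, mu=0.5, beta=0.3):
    x_prev = np.zeros(A.shape[1])
    x = np.zeros(A.shape[1])
    for _ in range(steps):
        grad = A.T @ (y - A @ x)
        direction = mu * grad + beta * (x - x_prev)   # memory term from the last iterate
        x_prev, x = x, hard_threshold(x + direction, k)
    return x

rng = np.random.default_rng(0)
m, n, k = 40, 100, 5
A = rng.normal(size=(m, n)) / np.sqrt(m)
x_true = np.zeros(n)
x_true[rng.choice(n, k, replace=False)] = rng.normal(size=k)
y = A @ x_true
x_hat = iht_with_memory(A, y, k)
print(np.linalg.norm(x_hat - x_true))   # reconstruction error on a noiseless toy problem
```
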
Experiments on synthetic data, audio signal reconstruction and image denoising demonstrate that the proposed algorithm performs comparably to several mainstream thresholding and greedy algorithms, and it works much faster than the ROTP-type algorithms especially when the sparsity level of signal is relatively low. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08284v1-abstract-full').style.display = 'none'; document.getElementById('2411.08284v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08227">arXiv:2411.08227</a> <span> [<a href="https://arxiv.org/pdf/2411.08227">pdf</a>, <a href="https://arxiv.org/format/2411.08227">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shawn Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+H">Huixian Gong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hao Dong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tiankai Yang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhengzhong Tu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yue Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08227v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection is essential for ensuring the robustness of machine learning models by identifying samples that deviate from the training distribution. While traditional OOD detection has primarily focused on single-modality inputs, such as images, recent advances in multimodal models have demonstrated the potential of leveraging multiple modalities (e.g., video, optical flow,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08227v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08227v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08227v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) detection is essential for ensuring the robustness of machine learning models by identifying samples that deviate from the training distribution. While traditional OOD detection has primarily focused on single-modality inputs, such as images, recent advances in multimodal models have demonstrated the potential of leveraging multiple modalities (e.g., video, optical flow, audio) to enhance detection performance. However, existing methods often overlook intra-class variability within in-distribution (ID) data, assuming that samples of the same class are perfectly cohesive and consistent. 
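The abstract does not spell DTAM out, so as a reference point only, here is a plain iterative hard-thresholding loop for $y \approx Ax$ with sparsity level $k$; it is a simplified relative of the ROTP/DTAM family (keep the $k$ largest entries after a gradient step), not the proposed algorithm.

```python
import numpy as np

def hard_threshold(v, k):
    """Keep the k largest-magnitude entries of v, zero out the rest."""
    out = np.zeros_like(v)
    idx = np.argsort(np.abs(v))[-k:]
    out[idx] = v[idx]
    return out

def iht(A, y, k, step=None, iters=200):
    """Basic iterative hard thresholding for y ~= A x with ||x||_0 <= k."""
    m, n = A.shape
    if step is None:
        step = 1.0 / np.linalg.norm(A, 2) ** 2   # conservative step size
    x = np.zeros(n)
    for _ in range(iters):
        x = hard_threshold(x + step * A.T @ (y - A @ x), k)
    return x

rng = np.random.default_rng(0)
A = rng.standard_normal((80, 200)) / np.sqrt(80)
x_true = np.zeros(200)
x_true[rng.choice(200, 5, replace=False)] = rng.standard_normal(5)
x_hat = iht(A, A @ x_true, k=5)
print(np.linalg.norm(x_hat - x_true))   # should be small if recovery succeeded
```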

arXiv:2411.08227 (https://arxiv.org/abs/2411.08227)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
Title: DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution Detection
Authors: Shawn Li, Huixian Gong, Hao Dong, Tiankai Yang, Zhengzhong Tu, Yue Zhao
Abstract: Out-of-distribution (OOD) detection is essential for ensuring the robustness of machine learning models by identifying samples that deviate from the training distribution. While traditional OOD detection has primarily focused on single-modality inputs, such as images, recent advances in multimodal models have demonstrated the potential of leveraging multiple modalities (e.g., video, optical flow, audio) to enhance detection performance. However, existing methods often overlook intra-class variability within in-distribution (ID) data, assuming that samples of the same class are perfectly cohesive and consistent. This assumption can lead to performance degradation, especially when prediction discrepancies are uniformly amplified across all samples. To address this issue, we propose Dynamic Prototype Updating (DPU), a novel plug-and-play framework for multimodal OOD detection that accounts for intra-class variations. Our method dynamically updates class center representations for each class by measuring the variance of similar samples within each batch, enabling adaptive adjustments. This approach allows us to amplify prediction discrepancies based on the updated class centers, thereby improving the model's robustness and generalization across different modalities. Extensive experiments on two tasks, five datasets, and nine base OOD algorithms demonstrate that DPU significantly improves OOD detection performance, setting a new state-of-the-art in multimodal OOD detection, with improvements of up to 80 percent in Far-OOD detection. To facilitate accessibility and reproducibility, our code is publicly available on GitHub.
Submitted 12 November, 2024; originally announced November 2024.
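A rough sketch of the batch-wise prototype update idea (class centers adjusted according to the dispersion of same-class samples in a batch). The weighting rule, momentum value, and function names below are assumptions for illustration, not the paper's formulation.

```python
import numpy as np

# Hypothetical illustration: shrink the update toward the old class center
# when the batch of same-class embeddings is highly dispersed.
def update_prototype(prototype, batch_embeddings, base_momentum=0.9):
    batch_mean = batch_embeddings.mean(axis=0)
    batch_var = batch_embeddings.var(axis=0).mean()          # scalar dispersion
    momentum = base_momentum + (1 - base_momentum) * batch_var / (1 + batch_var)
    return momentum * prototype + (1 - momentum) * batch_mean

proto = np.zeros(8)
batch = np.random.default_rng(1).normal(loc=1.0, scale=0.2, size=(16, 8))
print(update_prototype(proto, batch))
```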

arXiv:2411.07518 (https://arxiv.org/abs/2411.07518)
Subjects: Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR)
Title: LLM App Squatting and Cloning
Authors: Yinglin Xie, Xinyi Hou, Yanjie Zhao, Kai Chen, Haoyu Wang
Abstract: Impersonation tactics, such as app squatting and app cloning, have posed longstanding challenges in mobile app stores, where malicious actors exploit the names and reputations of popular apps to deceive users. With the rapid growth of Large Language Model (LLM) stores like GPT Store and FlowGPT, these issues have similarly surfaced, threatening the integrity of the LLM app ecosystem. In this study, we present the first large-scale analysis of LLM app squatting and cloning using our custom-built tool, LLMappCrazy. LLMappCrazy covers 14 squatting generation techniques and integrates Levenshtein distance and BERT-based semantic analysis to detect cloning by analyzing app functional similarities. Using this tool, we generated variations of the top 1000 app names and found over 5,000 squatting apps in the dataset. Additionally, we observed 3,509 squatting apps and 9,575 cloning cases across six major platforms. After sampling, we find that 18.7% of the squatting apps and 4.9% of the cloning apps exhibited malicious behavior, including phishing, malware distribution, fake content dissemination, and aggressive ad injection.
Submitted 11 November, 2024; originally announced November 2024.
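The name-similarity side of squatting detection can be illustrated with a plain Levenshtein distance and a toy threshold; the threshold, example names, and helper names are illustrative assumptions, not values or code from LLMappCrazy.

```python
# Minimal sketch of edit-distance-based name squatting detection.
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

def looks_like_squatting(candidate: str, official: str, max_edits: int = 2) -> bool:
    # Close to the official name, but not identical to it.
    return 0 < levenshtein(candidate.lower(), official.lower()) <= max_edits

print(looks_like_squatting("ChatGTP", "ChatGPT"))  # True: two substitutions
```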

arXiv:2411.05873 (https://arxiv.org/abs/2411.05873)
Subjects: Machine Learning (cs.LG); Computer Vision and Pattern Recognition (cs.CV); Distributed, Parallel, and Cluster Computing (cs.DC); Neural and Evolutionary Computing (cs.NE)
Title: Poor Man's Training on MCUs: A Memory-Efficient Quantized Back-Propagation-Free Approach
Authors: Yequan Zhao, Hai Li, Ian Young, Zheng Zhang
Abstract: Back propagation (BP) is the default solution for gradient computation in neural network training. However, implementing BP-based training on various edge devices, such as FPGAs, microcontrollers (MCUs), and analog computing platforms, faces multiple major challenges, such as the lack of hardware resources, long time-to-market, and dramatic errors in a low-precision setting. This paper presents a simple BP-free training scheme on an MCU, which makes edge training hardware design as easy as inference hardware design. We adopt a quantized zeroth-order method to estimate the gradients of quantized model parameters, which can overcome the error of a straight-through estimator in a low-precision BP scheme. We further employ a few dimension reduction methods (e.g., node perturbation, sparse training) to improve the convergence of zeroth-order training. Experimental results show that our BP-free training achieves performance comparable to BP-based training when adapting a pre-trained image classifier to various corrupted data on resource-constrained edge devices (e.g., an MCU with 1024 KB of SRAM for dense full-model training, or an MCU with 256 KB of SRAM for sparse training). This method is most suitable for application scenarios where memory cost and time-to-market are the major concerns, but longer latency can be tolerated.
Submitted 7 November, 2024; originally announced November 2024.
ACM Class: I.2; C.3
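A minimal sketch of the zeroth-order (SPSA-style) gradient estimate that BP-free training of this kind builds on: two loss evaluations with a random sign perturbation replace backpropagation. Quantization and the paper's dimension-reduction tricks are omitted, and the toy loss is an assumption.

```python
import numpy as np

def spsa_gradient(loss_fn, params, eps=1e-2, rng=np.random.default_rng(0)):
    """Two-point zeroth-order gradient estimate with a random sign perturbation."""
    delta = rng.choice([-1.0, 1.0], size=params.shape)
    g_hat = (loss_fn(params + eps * delta) - loss_fn(params - eps * delta)) / (2 * eps)
    return g_hat * delta          # rank-one estimate of the gradient

def loss_fn(w):                   # toy quadratic loss for demonstration only
    return float(np.sum((w - 3.0) ** 2))

w = np.zeros(5)
for _ in range(500):
    w -= 0.05 * spsa_gradient(loss_fn, w)
print(w)                          # approaches [3, 3, 3, 3, 3]
```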

arXiv:2411.05764 (https://arxiv.org/abs/2411.05764)
Subjects: Computation and Language (cs.CL); Machine Learning (cs.LG)
Title: FinDVer: Explainable Claim Verification over Long and Hybrid-Content Financial Documents
Authors: Yilun Zhao, Yitao Long, Yuru Jiang, Chengye Wang, Weiyuan Chen, Hongjun Liu, Yiming Zhang, Xiangru Tang, Chen Zhao, Arman Cohan
Abstract: We introduce FinDVer, a comprehensive benchmark specifically designed to evaluate the explainable claim verification capabilities of LLMs in the context of understanding and analyzing long, hybrid-content financial documents. FinDVer contains 2,400 expert-annotated examples, divided into three subsets: information extraction, numerical reasoning, and knowledge-intensive reasoning, each addressing common scenarios encountered in real-world financial contexts. We assess a broad spectrum of LLMs under long-context and RAG settings. Our results show that even the current best-performing system, GPT-4o, still lags behind human experts. We further provide in-depth analysis of the long-context and RAG settings, Chain-of-Thought reasoning, and model reasoning errors, offering insights to drive future advancements. We believe that FinDVer can serve as a valuable benchmark for evaluating LLMs in claim verification over complex, expert-domain documents.
Submitted 8 November, 2024; originally announced November 2024.
Comments: EMNLP 2024

arXiv:2411.05651 (https://arxiv.org/abs/2411.05651)
Subjects: Human-Computer Interaction (cs.HC)
Title: LightVA: Lightweight Visual Analytics with LLM Agent-Based Task Planning and Execution
Authors: Yuheng Zhao, Junjie Wang, Linbin Xiang, Xiaowen Zhang, Zifei Guo, Cagatay Turkay, Yu Zhang, Siming Chen
Abstract: Visual analytics (VA) requires analysts to iteratively propose analysis tasks based on observations and to execute tasks by creating visualizations and interactive explorations to gain insights. This process demands skills in programming, data processing, and visualization tools, highlighting the need for a more intelligent, streamlined VA approach. Large language models (LLMs) have recently been developed as agents to handle various tasks with dynamic planning and tool-using capabilities, offering the potential to enhance the efficiency and versatility of VA. We propose LightVA, a lightweight VA framework that supports task decomposition, data analysis, and interactive exploration through human-agent collaboration. Our method is designed to help users progressively translate high-level analytical goals into low-level tasks, producing visualizations and deriving insights. Specifically, we introduce an LLM agent-based task planning and execution strategy, employing a recursive process involving a planner, an executor, and a controller. The planner is responsible for recommending and decomposing tasks; the executor handles task execution, including data analysis, visualization generation, and multi-view composition; and the controller coordinates the interaction between the planner and the executor. Building on the framework, we develop a system with a hybrid user interface that includes a task flow diagram for monitoring and managing the task planning process, a visualization panel for interactive data exploration, and a chat view for guiding the model through natural language instructions. We examine the effectiveness of our method through a usage scenario and an expert study.
Submitted 8 November, 2024; originally announced November 2024.
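A schematic, heavily simplified planner/executor/controller loop of the sort the abstract outlines; `call_llm`, the prompts, and the stopping rule are placeholders, not LightVA's implementation.

```python
# Illustrative agent loop: a planner proposes sub-tasks, an executor carries
# them out, and a controller coordinates the two until nothing is left to do.
def call_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def plan(goal: str, history: list) -> list:
    reply = call_llm(f"Goal: {goal}\nDone so far: {history}\nList the next sub-tasks.")
    return [line.strip() for line in reply.splitlines() if line.strip()]

def execute(task: str) -> str:
    return call_llm(f"Carry out this analysis task and report the result: {task}")

def controller(goal: str, max_rounds: int = 5) -> list:
    history = []
    for _ in range(max_rounds):
        tasks = plan(goal, history)
        if not tasks:                 # planner found nothing left to do
            break
        for task in tasks:
            history.append((task, execute(task)))
    return history
```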

arXiv:2411.05345 (https://arxiv.org/abs/2411.05345)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Title: Reasoning Robustness of LLMs to Adversarial Typographical Errors
Authors: Esther Gan, Yiran Zhao, Liying Cheng, Yancan Mao, Anirudh Goyal, Kenji Kawaguchi, Min-Yen Kan, Michael Shieh
Abstract: Large Language Models (LLMs) have demonstrated impressive capabilities in reasoning using Chain-of-Thought (CoT) prompting. However, CoT can be biased by users' instructions. In this work, we study the reasoning robustness of LLMs to typographical errors, which can naturally occur in users' queries. We design an Adversarial Typo Attack (ATA) algorithm that iteratively samples typos for words that are important to the query and selects the edit that is most likely to succeed in attacking. We show that LLMs are sensitive to minimal adversarial typographical changes. Notably, with 1 character edit, Mistral-7B-Instruct's accuracy drops from 43.7% to 38.6% on GSM8K, while with 8 character edits the performance further drops to 19.2%. To extend our evaluation to larger and closed-source LLMs, we develop the R²ATA benchmark, which assesses models' Reasoning Robustness to ATA. It includes adversarial typographical questions derived from three widely used reasoning datasets (GSM8K, BBH, and MMLU) by applying ATA to open-source LLMs. R²ATA demonstrates remarkable transferability and causes notable performance drops across multiple super-large and closed-source LLMs.
Submitted 8 November, 2024; originally announced November 2024.
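A toy version of the greedy step behind an adversarial typo attack: generate single-character edits of an important word and keep the one that hurts the model most. `model_confidence`, the edit set, and the selection rule are placeholders, not the paper's ATA algorithm.

```python
import random

def model_confidence(question: str) -> float:
    raise NotImplementedError("return the model's confidence in a correct answer")

def char_edits(word: str, rng: random.Random, n: int = 10) -> list:
    """Random single-character substitutions of `word` (toy edit set)."""
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    edits = set()
    while len(edits) < n:
        i = rng.randrange(len(word))
        cand = word[:i] + rng.choice(alphabet) + word[i + 1:]
        if cand != word:
            edits.add(cand)
    return list(edits)

def attack_once(question: str, important_word: str, seed: int = 0) -> str:
    rng = random.Random(seed)
    candidates = [question.replace(important_word, w, 1)
                  for w in char_edits(important_word, rng)]
    return min(candidates, key=model_confidence)   # keep the most damaging edit
```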

arXiv:2411.05335 (https://arxiv.org/abs/2411.05335)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Cryptography and Security (cs.CR); Machine Learning (cs.LG)
Title: A Quality-Centric Framework for Generic Deepfake Detection
Authors: Wentang Song, Zhiyuan Yan, Yuzhen Lin, Taiping Yao, Changsheng Chen, Shen Chen, Yandan Zhao, Shouhong Ding, Bin Li
Abstract: This paper addresses the generalization issue in deepfake detection by harnessing forgery quality in training data. Generally, the forgery quality of different deepfakes varies: some have easily recognizable forgery clues, while others are highly realistic. Existing works often train detectors on a mix of deepfakes with varying forgery qualities, potentially leading detectors to short-cut the easy-to-spot artifacts from low-quality forgery samples and thereby hurting generalization performance. To tackle this issue, we propose a novel quality-centric framework for generic deepfake detection, which is composed of a Quality Evaluator, a low-quality data enhancement module, and a learning pacing strategy that explicitly incorporates forgery quality into the training process. The framework is inspired by curriculum learning and is designed to gradually enable the detector to learn more challenging deepfake samples, starting with easier samples and progressing to more realistic ones. We employ both static and dynamic assessments of forgery quality, combining their scores to produce a final rating for each training sample. The rating score guides the selection of deepfake samples for training, with higher-rated samples having a higher probability of being chosen. Furthermore, we propose a novel frequency data augmentation method specifically designed for low-quality forgery samples, which helps to reduce obvious forgery traces and improve their overall realism. Extensive experiments show that our method can be applied in a plug-and-play manner and significantly enhances generalization performance.
Submitted 8 November, 2024; originally announced November 2024.
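The pacing strategy can be pictured as rating-weighted sampling: higher-rated (more realistic) forgeries are drawn more often. The softmax weighting, temperature, and ratings below are made-up values for illustration, not the paper's scheme.

```python
import numpy as np

def sample_batch(ratings, batch_size, temperature=1.0, rng=np.random.default_rng(0)):
    """Draw sample indices with probability increasing in the quality rating."""
    weights = np.exp(np.asarray(ratings, dtype=float) / temperature)
    probs = weights / weights.sum()
    return rng.choice(len(ratings), size=batch_size, replace=True, p=probs)

ratings = [0.2, 0.9, 0.5, 0.95, 0.1]          # higher = more realistic forgery
print(sample_batch(ratings, batch_size=8))
```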

arXiv:2411.05292 (https://arxiv.org/abs/2411.05292)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
Title: SimpleBEV: Improved LiDAR-Camera Fusion Architecture for 3D Object Detection
Authors: Yun Zhao, Zhan Gong, Peiru Zheng, Hong Zhu, Shaohua Wu
Abstract: More and more research works fuse LiDAR and camera information to improve 3D object detection in autonomous driving systems. Recently, a simple yet effective fusion framework has achieved excellent detection performance by fusing the LiDAR and camera features in a unified bird's-eye-view (BEV) space. In this paper, we propose a LiDAR-camera fusion framework, named SimpleBEV, for accurate 3D object detection, which follows the BEV-based fusion framework and improves the camera and LiDAR encoders, respectively. Specifically, we perform camera-based depth estimation using a cascade network and rectify the depth results with depth information derived from the LiDAR points. Meanwhile, an auxiliary branch that performs 3D object detection using only the camera-BEV features is introduced to exploit the camera information during the training phase. Besides, we improve the LiDAR feature extractor by fusing multi-scale sparse convolutional features. Experimental results demonstrate the effectiveness of our proposed method: it achieves 77.6% NDS accuracy on the nuScenes dataset, showcasing superior performance on the 3D object detection track.
Submitted 7 November, 2024; originally announced November 2024.

arXiv:2411.05274 (https://arxiv.org/abs/2411.05274)
Subjects: Machine Learning (cs.LG)
Title: Distributed-Order Fractional Graph Operating Network
Authors: Kai Zhao, Xuhao Li, Qiyu Kang, Feng Ji, Qinxu Ding, Yanan Zhao, Wenfei Liang, Wee Peng Tay
Abstract: We introduce the Distributed-order fRActional Graph Operating Network (DRAGON), a novel continuous Graph Neural Network (GNN) framework that incorporates distributed-order fractional calculus. Unlike traditional continuous GNNs that utilize integer-order or single fractional-order differential equations, DRAGON uses a learnable probability distribution over a range of real numbers for the derivative orders. By allowing a flexible and learnable superposition of multiple derivative orders, our framework captures complex graph feature updating dynamics beyond the reach of conventional models. We provide a comprehensive interpretation of our framework's capability to capture intricate dynamics through the lens of a non-Markovian graph random walk with node feature updating driven by an anomalous diffusion process over the graph. Furthermore, to highlight the versatility of the DRAGON framework, we conduct empirical evaluations across a range of graph learning tasks. The results consistently demonstrate superior performance when compared to traditional continuous GNN models. The implementation code is available at https://github.com/zknus/NeurIPS-2024-DRAGON.
Submitted 7 November, 2024; originally announced November 2024.

arXiv:2411.05197 (https://arxiv.org/abs/2411.05197)
Subjects: Machine Learning (cs.LG)
Title: Hardware and Software Platform Inference
Authors: Cheng Zhang, Hanna Foerster, Robert D. Mullins, Yiren Zhao, Ilia Shumailov
Abstract: It is now a common business practice to buy access to large language model (LLM) inference rather than to self-host, because of significant upfront hardware infrastructure and energy costs. However, as a buyer, there is no mechanism to verify the authenticity of the advertised service, including the serving hardware platform, e.g. that it is actually being served using an NVIDIA H100. Furthermore, there are reports suggesting that model providers may deliver models that differ slightly from the advertised ones, often to make them run on less expensive hardware. That way, a client pays a premium for access to a capable model on more expensive hardware, yet ends up being served by a (potentially less capable) cheaper model on cheaper hardware. In this paper we introduce hardware and software platform inference (HSPI), a method for identifying the underlying GPU architecture and software stack of a (black-box) machine learning model solely based on its input-output behavior. Our method leverages the inherent differences of various GPU architectures and compilers to distinguish between different GPU types and software stacks. By analyzing the numerical patterns in the model's outputs, we propose a classification framework capable of accurately identifying the GPU used for model inference as well as the underlying software configuration. Our findings demonstrate the feasibility of inferring GPU type from black-box models. We evaluate HSPI against models served on different real hardware and find that in a white-box setting we can distinguish between different GPUs with between 83.9% and 100% accuracy. Even in a black-box setting we are able to achieve results that are up to three times higher than random-guess accuracy.
Submitted 7 November, 2024; originally announced November 2024.
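To make the classification idea concrete, here is a toy stand-in: low-order statistics of a model's output logits are used as features for an off-the-shelf classifier. The synthetic "platforms" differ only by a small numeric bias, which is purely an assumption for demonstration; this is not the HSPI feature set or evaluation.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

def fake_outputs(platform_bias, n=200, dim=16):
    """Synthetic logits whose numeric statistics shift slightly per 'platform'."""
    logits = rng.standard_normal((n, dim)) + platform_bias
    return np.column_stack([logits.mean(1), logits.std(1),
                            np.abs(logits).min(1), np.abs(logits).max(1)])

X = np.vstack([fake_outputs(0.0), fake_outputs(0.3)])
y = np.array([0] * 200 + [1] * 200)             # two hypothetical platforms
clf = LogisticRegression(max_iter=1000).fit(X, y)
print(clf.score(X, y))                           # training accuracy on toy data
```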

arXiv:2411.04406 (https://arxiv.org/abs/2411.04406)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Title: Image Understanding Makes for A Good Tokenizer for Image Generation
Authors: Luting Wang, Yang Zhao, Zijian Zhang, Jiashi Feng, Si Liu, Bingyi Kang
Abstract: Modern image generation (IG) models have been shown to capture rich semantics valuable for image understanding (IU) tasks. However, the potential of IU models to improve IG performance remains uncharted. We address this issue using a token-based IG framework, which relies on effective tokenizers to project images into token sequences. Currently, pixel reconstruction (e.g., VQGAN) dominates the training objective for image tokenizers. In contrast, our approach adopts the feature reconstruction objective, where tokenizers are trained by distilling knowledge from pretrained IU encoders. Comprehensive comparisons indicate that tokenizers with strong IU capabilities achieve superior IG performance across a variety of metrics, datasets, tasks, and proposal networks. Notably, VQ-KD CLIP achieves 4.10 FID on ImageNet-1k (IN-1k). Visualization suggests that the superiority of VQ-KD can be partly attributed to the rich semantics within the VQ-KD codebook. We further introduce a straightforward pipeline to directly transform IU encoders into tokenizers, demonstrating exceptional effectiveness for IG tasks. These discoveries may energize further exploration into image tokenizer research and inspire the community to reassess the relationship between IU and IG. The code is released at https://github.com/magic-research/vector_quantization.
Submitted 6 November, 2024; originally announced November 2024.
Comments: Accepted by NeurIPS 2024

arXiv:2411.04075 (https://arxiv.org/abs/2411.04075)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Title: M3SciQA: A Multi-Modal Multi-Document Scientific QA Benchmark for Evaluating Foundation Models
Authors: Chuhan Li, Ziyao Shangguan, Yilun Zhao, Deyuan Li, Yixin Liu, Arman Cohan
Abstract: Existing benchmarks for evaluating foundation models mainly focus on single-document, text-only tasks. However, they often fail to fully capture the complexity of research workflows, which typically involve interpreting non-textual data and gathering information across multiple documents. To address this gap, we introduce M3SciQA, a multi-modal, multi-document scientific question answering benchmark designed for a more comprehensive evaluation of foundation models. M3SciQA consists of 1,452 expert-annotated questions spanning 70 natural language processing paper clusters, where each cluster represents a primary paper along with all its cited documents, mirroring the workflow of comprehending a single paper by requiring multi-modal and multi-document data. With M3SciQA, we conduct a comprehensive evaluation of 18 foundation models. Our results indicate that current foundation models still significantly underperform compared to human experts in multi-modal information retrieval and in reasoning across multiple scientific documents. Additionally, we explore the implications of these findings for the future advancement of applying foundation models in multi-modal scientific literature analysis.
Submitted 6 November, 2024; originally announced November 2024.

arXiv:2411.03668 (https://arxiv.org/abs/2411.03668)
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Mobile Recording Device Recognition Based Cross-Scale and Multi-Level Representation Learning
Authors: Chunyan Zeng, Yuhao Zhao, Zhifeng Wang
Abstract: This paper introduces a modeling approach that employs multi-level global processing, encompassing both short-term frame-level and long-term sample-level feature scales. In the initial stage of shallow feature extraction, various scales are employed to extract multi-level features, including Mel-Frequency Cepstral Coefficients (MFCC) and the pre-Fbank log energy spectrum. The construction of the identification network model involves considering the input two-dimensional temporal features from both the frame and sample levels. Specifically, the model initially employs one-dimensional convolution-based Convolutional Long Short-Term Memory (ConvLSTM) to fuse spatiotemporal information and extract short-term frame-level features. Subsequently, bidirectional Long Short-Term Memory (BiLSTM) is utilized to learn long-term sample-level sequential representations. A transformer encoder then performs cross-scale, multi-level processing on the global frame-level and sample-level features, facilitating deep feature representation and fusion at both levels. Finally, recognition results are obtained through a Softmax layer. Our method achieves an impressive 99.6% recognition accuracy on the CCNU_Mobile dataset, exhibiting a notable improvement of 2% to 12% over the baseline system. Additionally, we thoroughly investigate the transferability of our model, achieving 87.9% accuracy in a classification task on a new dataset.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 16 pages

arXiv:2411.03321 (https://arxiv.org/abs/2411.03321)
Subjects: Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG)
Title: Will Trump Win in 2024? Predicting the US Presidential Election via Multi-step Reasoning with Large Language Models
Authors: Chenxiao Yu, Zhaotian Weng, Zheng Li, Xiyang Hu, Yue Zhao
Abstract: Can Large Language Models (LLMs) accurately predict election outcomes? While LLMs have demonstrated impressive performance in various domains, including healthcare, legal analysis, and creative tasks, their ability to forecast elections remains unknown. Election prediction poses unique challenges, such as limited voter-level data, rapidly changing political landscapes, and the need to model complex human behavior. To address these challenges, we introduce a multi-step reasoning framework designed for political analysis. Our approach is validated on real-world data from the American National Election Studies (ANES) 2016 and 2020, as well as on synthetic personas generated by a leading machine learning framework, offering scalable datasets for voter behavior modeling. To capture temporal dynamics, we incorporate candidates' policy positions and biographical details, ensuring that the model adapts to evolving political contexts. Drawing on Chain-of-Thought prompting, our multi-step reasoning pipeline systematically integrates demographic, ideological, and time-dependent factors, enhancing the model's predictive power. Additionally, we apply our framework to predict the outcome of the 2024 U.S. presidential election in advance, demonstrating the adaptability of LLMs to unseen political data.
Submitted 21 October, 2024; originally announced November 2024.
Comments: This research is ongoing work. Xiyang Hu and Yue Zhao are the corresponding authors.
Xiyang Hu and Yue Zhao are the corresponding authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02983">arXiv:2411.02983</a> <span> [<a href="https://arxiv.org/pdf/2411.02983">pdf</a>, <a href="https://arxiv.org/format/2411.02983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Decision Making for UAV Cooperative Pursuit-Evasion Game with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Z">Zidong Nie</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kangsheng Dong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qinghua Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02983v1-abstract-short" style="display: inline;"> The application of intelligent decision-making in unmanned aerial vehicle (UAV) is increasing, and with the development of UAV 1v1 pursuit-evasion game, multi-UAV cooperative game has emerged as a new challenge. This paper proposes a deep reinforcement learning-based model for decision-making in multi-role UAV cooperative pursuit-evasion game, to address the challenge of enabling UAV to autonomous… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02983v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02983v1-abstract-full" style="display: none;"> The application of intelligent decision-making in unmanned aerial vehicle (UAV) is increasing, and with the development of UAV 1v1 pursuit-evasion game, multi-UAV cooperative game has emerged as a new challenge. This paper proposes a deep reinforcement learning-based model for decision-making in multi-role UAV cooperative pursuit-evasion game, to address the challenge of enabling UAV to autonomously make decisions in complex game environments. In order to enhance the training efficiency of the reinforcement learning algorithm in UAV pursuit-evasion game environment that has high-dimensional state-action space, this paper proposes a multi-environment asynchronous double deep Q-network with priority experience replay algorithm to effectively train the UAV's game policy. Furthermore, aiming to improve cooperation ability and task completion efficiency, as well as minimize the cost of UAVs in the pursuit-evasion game, this paper focuses on the allocation of roles and targets within multi-UAV environment. Cooperative game decision models with varying numbers of UAVs are obtained by assigning diverse tasks and roles to the UAVs in different scenarios. 
The simulation results demonstrate that the proposed method enables autonomous decision-making of the UAVs in pursuit-evasion game scenarios and exhibits significant capabilities in cooperation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02983v1-abstract-full').style.display = 'none'; document.getElementById('2411.02983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 12 figures, 31 conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02715">arXiv:2411.02715</a> <span> [<a href="https://arxiv.org/pdf/2411.02715">pdf</a>, <a href="https://arxiv.org/format/2411.02715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CIT: Rethinking Class-incremental Semantic Segmentation with a Class Independent Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jinchao Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Akide Liu</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+M+H">Minh Hieu Phan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Y">Yangyang Shu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02715v1-abstract-short" style="display: inline;"> Class-incremental semantic segmentation (CSS) requires that a model learn to segment new classes without forgetting how to segment previous ones: this is typically achieved by distilling the current knowledge and incorporating the latest data. However, bypassing iterative distillation by directly transferring outputs of initial classes to the current learning task is not supported in existing clas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02715v1-abstract-full" style="display: none;"> Class-incremental semantic segmentation (CSS) requires that a model learn to segment new classes without forgetting how to segment previous ones: this is typically achieved by distilling the current knowledge and incorporating the latest data. However, bypassing iterative distillation by directly transferring outputs of initial classes to the current learning task is not supported in existing class-specific CSS methods. 
Via Softmax, they enforce dependency between classes and adjust the output distribution at each learning step, resulting in a large probability distribution gap between initial and current tasks. We introduce a simple, yet effective Class Independent Transformation (CIT) that converts the outputs of existing semantic segmentation models into class-independent forms with negligible cost or performance loss. By utilizing class-independent predictions facilitated by CIT, we establish an accumulative distillation framework, ensuring equitable incorporation of all class information. We conduct extensive experiments on various segmentation architectures, including DeepLabV3, Mask2Former, and SegViTv2. Results from these experiments show minimal task forgetting across different datasets, with less than 5% for ADE20K in the most challenging 11 task configurations and less than 1% across all configurations for the PASCAL VOC 2012 dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02715v1-abstract-full').style.display = 'none'; document.getElementById('2411.02715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02385">arXiv:2411.02385</a> <span> [<a href="https://arxiv.org/pdf/2411.02385">pdf</a>, <a href="https://arxiv.org/format/2411.02385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> How Far is Video Generation from World Model: A Physical Law Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+B">Bingyi Kang</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yang Yue</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+R">Rui Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhijie Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaixin Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Gao Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jiashi Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02385v1-abstract-short" style="display: inline;"> OpenAI's Sora highlights the potential of video generation for developing world models that adhere to fundamental physical laws. However, the ability of video generation models to discover such laws purely from visual data without human priors can be questioned. A world model learning the true law should give predictions robust to nuances and correctly extrapolate on unseen scenarios. 
In this work… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02385v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02385v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02385v1-abstract-full" style="display: none;"> OpenAI's Sora highlights the potential of video generation for developing world models that adhere to fundamental physical laws. However, the ability of video generation models to discover such laws purely from visual data without human priors can be questioned. A world model learning the true law should give predictions robust to nuances and correctly extrapolate on unseen scenarios. In this work, we evaluate across three key scenarios: in-distribution, out-of-distribution, and combinatorial generalization. We developed a 2D simulation testbed for object movement and collisions to generate videos deterministically governed by one or more classical mechanics laws. This provides an unlimited supply of data for large-scale experimentation and enables quantitative evaluation of whether the generated videos adhere to physical laws. We trained diffusion-based video generation models to predict object movements based on initial frames. Our scaling experiments show perfect generalization within the distribution, measurable scaling behavior for combinatorial generalization, but failure in out-of-distribution scenarios. Further experiments reveal two key insights about the generalization mechanisms of these models: (1) the models fail to abstract general physical rules and instead exhibit "case-based" generalization behavior, i.e., mimicking the closest training example; (2) when generalizing to new cases, models are observed to prioritize different factors when referencing training data: color > size > velocity > shape. Our study suggests that scaling alone is insufficient for video generation models to uncover fundamental physical laws, despite its role in Sora's broader success. See our project page at https://phyworld.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02385v1-abstract-full').style.display = 'none'; document.getElementById('2411.02385v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02319">arXiv:2411.02319</a> <span> [<a href="https://arxiv.org/pdf/2411.02319">pdf</a>, <a href="https://arxiv.org/format/2411.02319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GenXD: Generating Any 3D and 4D Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chung-Ching Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiwen Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02319v2-abstract-short" style="display: inline;"> Recent developments in 2D visual generation have been remarkably successful. However, 3D and 4D generation remain challenging in real-world applications due to the lack of large-scale 4D data and effective model design. In this paper, we propose to jointly investigate general 3D and 4D generation by leveraging camera and object movements commonly observed in daily life. Due to the lack of real-wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02319v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02319v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02319v2-abstract-full" style="display: none;"> Recent developments in 2D visual generation have been remarkably successful. However, 3D and 4D generation remain challenging in real-world applications due to the lack of large-scale 4D data and effective model design. In this paper, we propose to jointly investigate general 3D and 4D generation by leveraging camera and object movements commonly observed in daily life. Due to the lack of real-world 4D data in the community, we first propose a data curation pipeline to obtain camera poses and object motion strength from videos. Based on this pipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K. By leveraging all the 3D and 4D data, we develop our framework, GenXD, which allows us to produce any 3D or 4D scene. We propose multiview-temporal modules, which disentangle camera and object movements, to seamlessly learn from both 3D and 4D data. Additionally, GenXD employs masked latent conditions to support a variety of conditioning views. 
GenXD can generate videos that follow the camera trajectory as well as consistent 3D views that can be lifted into 3D representations. We perform extensive evaluations across various real-world and synthetic datasets, demonstrating GenXD's effectiveness and versatility compared to previous methods in 3D and 4D generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02319v2-abstract-full').style.display = 'none'; document.getElementById('2411.02319v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02181">arXiv:2411.02181</a> <span> [<a href="https://arxiv.org/pdf/2411.02181">pdf</a>, <a href="https://arxiv.org/ps/2411.02181">ps</a>, <a href="https://arxiv.org/format/2411.02181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Detect an Object At Once without Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+J">Junyu Hao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yongjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zuofan Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qi Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinlong Chen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jianguo Wei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Minghao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02181v1-abstract-short" style="display: inline;"> When presented with one or a few photos of a previously unseen object, humans can instantly recognize it in different scenes. Although the human brain mechanism behind this phenomenon is still not fully understood, this work introduces a novel technical realization of this task. It consists of two phases: (1) generating a Similarity Density Map (SDM) by convolving the scene image with the given ob… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02181v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02181v1-abstract-full" style="display: none;"> When presented with one or a few photos of a previously unseen object, humans can instantly recognize it in different scenes. Although the human brain mechanism behind this phenomenon is still not fully understood, this work introduces a novel technical realization of this task. 
It consists of two phases: (1) generating a Similarity Density Map (SDM) by convolving the scene image with the given object image patch(es) so that the highlight areas in the SDM indicate the possible locations; (2) obtaining the object-occupied areas in the scene through a Region Alignment Network (RAN). The RAN is constructed on a Deep Siamese Network (DSN) backbone, and unlike traditional DSNs, it aims to obtain accurate object regions by regressing the location and area differences between the ground truths and the predicted ones indicated by the highlight areas in SDM. By pre-learning from labels annotated in traditional datasets, the SDM-RAN can detect previously unknown objects without fine-tuning. Experiments were conducted on the MS COCO and PASCAL VOC datasets. The results indicate that the proposed method outperforms state-of-the-art methods on the same task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02181v1-abstract-full').style.display = 'none'; document.getElementById('2411.02181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02179">arXiv:2411.02179</a> <span> [<a href="https://arxiv.org/pdf/2411.02179">pdf</a>, <a href="https://arxiv.org/format/2411.02179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> CleAR: Robust Context-Guided Generative Lighting Estimation for Mobile Augmented Reality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yiqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Dasari%2C+M">Mallesham Dasari</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+T">Tian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02179v1-abstract-short" style="display: inline;"> High-quality environment lighting is the foundation of creating immersive user experiences in mobile augmented reality (AR) applications. However, achieving visually coherent environment lighting estimation for Mobile AR is challenging due to several key limitations associated with AR device sensing capabilities, including limitations in device camera FoV and pixel dynamic ranges. Recent advanceme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02179v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02179v1-abstract-full" style="display: none;"> High-quality environment lighting is the foundation of creating immersive user experiences in mobile augmented reality (AR) applications. 
However, achieving visually coherent environment lighting estimation for Mobile AR is challenging due to several key limitations associated with AR device sensing capabilities, including limitations in device camera FoV and pixel dynamic ranges. Recent advancements in generative AI, which can generate high-quality images from different types of prompts, including texts and images, present a potential solution for high-quality lighting estimation. Still, to effectively use generative image diffusion models, we must address their key limitations of generation hallucination and slow inference process. To do so, in this work, we design and implement a generative lighting estimation system called CleAR that can produce high-quality and diverse environment maps in the format of 360$^\circ$ images. Specifically, we design a two-step generation pipeline guided by AR environment context data to ensure the results follow physical environment visual context and color appearances. To improve the estimation robustness under different lighting conditions, we design a real-time refinement component to adjust lighting estimation results on AR devices. To train and test our generative models, we curate a large-scale environment lighting estimation dataset with diverse lighting conditions. Through quantitative evaluation and user study, we show that CleAR outperforms state-of-the-art lighting estimation methods on both estimation accuracy and robustness. Moreover, CleAR supports real-time refinement of lighting estimation results, ensuring robust and timely environment lighting updates for AR applications. Our end-to-end generative estimation takes as fast as 3.2 seconds, outperforming state-of-the-art methods by 110x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02179v1-abstract-full').style.display = 'none'; document.getElementById('2411.02179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02093">arXiv:2411.02093</a> <span> [<a href="https://arxiv.org/pdf/2411.02093">pdf</a>, <a href="https://arxiv.org/format/2411.02093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Do Advanced Language Models Eliminate the Need for Prompt Engineering in Software Engineering? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zeyu Sun</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zhihao Gong</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Sixiang Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yizhou Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yifan Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Q">Qingyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+D">Dan Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02093v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02093v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper presents an extensive empirical study that reevaluates various prompt engineering techniques within the context of these advanced LLMs. Focusing on three representative SE tasks, i.e., code generation, code translation, and code summarization, we assess whether prompt engineering techniques still yield improvements with advanced models, the actual effectiveness of reasoning models compared to non-reasoning models, and whether the benefits of using these advanced models justify their increased costs. Our findings reveal that prompt engineering techniques developed for earlier LLMs may provide diminished benefits or even hinder performance when applied to advanced models. In reasoning LLMs, the ability of sophisticated built-in reasoning reduces the impact of complex prompts, sometimes making simple zero-shot prompting more effective. Furthermore, while reasoning models outperform non-reasoning models in tasks requiring complex reasoning, they offer minimal advantages in tasks that do not need reasoning and may incur unnecessary costs. Based on our study, we provide practical guidance for practitioners on selecting appropriate prompt engineering techniques and foundational LLMs, considering factors such as task requirements, operational costs, and environmental impact. Our work contributes to a deeper understanding of effectively harnessing advanced LLMs in SE tasks, informing future research and application development. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'none'; document.getElementById('2411.02093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01947">arXiv:2411.01947</a> <span> [<a href="https://arxiv.org/pdf/2411.01947">pdf</a>, <a href="https://arxiv.org/format/2411.01947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HACD: Harnessing Attribute Semantics and Mesoscopic Structure for Community Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Anran Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xingfen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuhan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01947v1-abstract-short" style="display: inline;"> Community detection plays a pivotal role in uncovering closely connected subgraphs, aiding various real-world applications such as recommendation systems and anomaly detection. With the surge of rich information available for entities in real-world networks, the community detection problem in attributed networks has attracted widespread attention. While previous research has effectively leveraged… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01947v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01947v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01947v1-abstract-full" style="display: none;"> Community detection plays a pivotal role in uncovering closely connected subgraphs, aiding various real-world applications such as recommendation systems and anomaly detection. With the surge of rich information available for entities in real-world networks, the community detection problem in attributed networks has attracted widespread attention. While previous research has effectively leveraged network topology and attribute information for attributed community detection, these methods overlook two critical issues: (i) the semantic similarity between node attributes within the community, and (ii) the inherent mesoscopic structure, which differs from the pairwise connections of the micro-structure. To address these limitations, we propose HACD, a novel attributed community detection model based on heterogeneous graph attention networks. 
HACD treats node attributes as another type of node, constructs attributed networks into heterogeneous graph structures and employs attribute-level attention mechanisms to capture semantic similarity. Furthermore, HACD introduces a community membership function to explore mesoscopic community structures, enhancing the robustness of detected communities. Extensive experiments demonstrate the effectiveness and efficiency of HACD, outperforming state-of-the-art methods in attributed community detection tasks. Our code is publicly available at https://github.com/Anniran1/HACD1-wsdm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01947v1-abstract-full').style.display = 'none'; document.getElementById('2411.01947v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01822">arXiv:2411.01822</a> <span> [<a href="https://arxiv.org/pdf/2411.01822">pdf</a>, <a href="https://arxiv.org/format/2411.01822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Distribution alignment based transfer fusion frameworks on quantum devices for seeking quantum advantages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xi He</a>, <a href="/search/cs?searchtype=author&query=Du%2C+F">Feiyu Du</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaohan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+T">Tao Lei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01822v1-abstract-short" style="display: inline;"> The scarcity of labelled data is a particularly urgent challenge in the field of quantum machine learning (QML). Two transfer fusion frameworks are proposed in this paper to predict the labels of target domain data by aligning its distribution to a different but related labelled source domain on quantum devices. The frameworks fuse the quantum data from two different, but related domains throu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01822v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01822v1-abstract-full" style="display: none;"> The scarcity of labelled data is a particularly urgent challenge in the field of quantum machine learning (QML). Two transfer fusion frameworks are proposed in this paper to predict the labels of target domain data by aligning its distribution to a different but related labelled source domain on quantum devices. The frameworks fuse the quantum data from two different, but related domains through a quantum information infusion channel. 
The prediction tasks in the target domain can be achieved with quantum advantages by post-processing quantum measurement results. One framework, the quantum basic linear algebra subroutines (QBLAS)-based implementation, can theoretically achieve the procedure of transfer fusion with quadratic speedup on a universal quantum computer. In addition, the other framework, a hardware-scalable architecture, is implemented on noisy intermediate-scale quantum (NISQ) devices through a variational hybrid quantum-classical procedure. Numerical experiments on the synthetic and handwritten digits datasets demonstrate that the variational transfer fusion (TF) framework can reach state-of-the-art (SOTA) quantum DA method performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01822v1-abstract-full').style.display = 'none'; document.getElementById('2411.01822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01808">arXiv:2411.01808</a> <span> [<a href="https://arxiv.org/pdf/2411.01808">pdf</a>, <a href="https://arxiv.org/format/2411.01808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Fixing the Loose Brake: Exponential-Tailed Stopping Time in Best Arm Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Balagopalan%2C+K">Kapilan Balagopalan</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T+N">Tuan Ngo Nguyen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jun%2C+K">Kwang-Sung Jun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01808v1-abstract-short" style="display: inline;"> The best arm identification problem requires identifying the best alternative (i.e., arm) in active experimentation using the smallest number of experiments (i.e., arm pulls), which is crucial for cost-efficient and timely decision-making processes. In the fixed confidence setting, an algorithm must stop data-dependently and return the estimated best arm with a correctness guarantee. Since this st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01808v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01808v1-abstract-full" style="display: none;"> The best arm identification problem requires identifying the best alternative (i.e., arm) in active experimentation using the smallest number of experiments (i.e., arm pulls), which is crucial for cost-efficient and timely decision-making processes. 
In the fixed confidence setting, an algorithm must stop data-dependently and return the estimated best arm with a correctness guarantee. Since this stopping time is random, we desire its distribution to have light tails. Unfortunately, many existing studies focus on high probability or in expectation bounds on the stopping time, which allow heavy tails and, for high probability bounds, even the possibility of never stopping at all. We first prove that this never-stopping event can indeed happen for some popular algorithms. Motivated by this, we propose algorithms that provably enjoy an exponential-tailed stopping time, which improves upon the polynomial tail bound reported by Kalyanakrishnan et al. (2012). The first algorithm is based on a fixed budget algorithm called Sequential Halving along with a doubling trick. The second algorithm is a meta algorithm that takes in any fixed confidence algorithm with a high probability stopping guarantee and turns it into one that enjoys an exponential-tailed stopping time. Our results imply that there is much more to be desired for contemporary fixed confidence algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01808v1-abstract-full').style.display = 'none'; document.getElementById('2411.01808v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00330">arXiv:2411.00330</a> <span> [<a href="https://arxiv.org/pdf/2411.00330">pdf</a>, <a href="https://arxiv.org/format/2411.00330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multiple Information Prompt Learning for Cloth-Changing Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shengxun Wei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+W">Weili Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00330v1-abstract-short" style="display: inline;"> Cloth-changing person re-identification is a subject closer to the real world, which focuses on solving the problem of person re-identification after pedestrians change clothes. The primary challenge in this field is to overcome the complex interplay between intra-class and inter-class variations and to identify features that remain unaffected by changes in appearance. 
Sufficient data collection f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00330v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00330v1-abstract-full" style="display: none;"> Cloth-changing person re-identification is a subject closer to the real world, which focuses on solving the problem of person re-identification after pedestrians change clothes. The primary challenge in this field is to overcome the complex interplay between intra-class and inter-class variations and to identify features that remain unaffected by changes in appearance. Sufficient data collection for model training would significantly aid in addressing this problem. However, it is challenging to gather diverse datasets in practice. Current methods focus on implicitly learning identity information from the original image or introducing additional auxiliary models, which are largely limited by the quality of the image and the performance of the additional model. To address these issues, inspired by prompt learning, we propose a novel multiple information prompt learning (MIPL) scheme for cloth-changing person ReID, which learns identity robust features through the common prompt guidance of multiple messages. Specifically, the clothing information stripping (CIS) module is designed to decouple the clothing information from the original RGB image features to counteract the influence of clothing appearance. The Bio-guided attention (BGA) module is proposed to increase the learning intensity of the model for key information. A dual-length hybrid patch (DHP) module is employed to make the features have diverse coverage to minimize the impact of feature bias. Extensive experiments demonstrate that the proposed method outperforms all state-of-the-art methods on the LTCC, Celeb-reID, Celeb-reID-light, and CSCC datasets, achieving rank-1 scores of 74.8%, 73.3%, 66.0%, and 88.1%, respectively. When compared to AIM (CVPR23), ACID (TIP23), and SCNet (MM23), MIPL achieves rank-1 improvements of 11.3%, 13.8%, and 7.9%, respectively, on the PRCC dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00330v1-abstract-full').style.display = 'none'; document.getElementById('2411.00330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24214">arXiv:2410.24214</a> <span> [<a href="https://arxiv.org/pdf/2410.24214">pdf</a>, <a href="https://arxiv.org/format/2410.24214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ARQ: A Mixed-Precision Quantization Framework for Accurate and Certifiably Robust DNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuchen Yang</a>, <a href="/search/cs?searchtype=author&query=Ugare%2C+S">Shubham Ugare</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yifan Zhao</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+G">Gagandeep Singh</a>, <a href="/search/cs?searchtype=author&query=Misailovic%2C+S">Sasa Misailovic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24214v1-abstract-short" style="display: inline;"> Mixed precision quantization has become an important technique for enabling the execution of deep neural networks (DNNs) on limited resource computing platforms. Traditional quantization methods have primarily concentrated on maintaining neural network accuracy, either ignoring the impact of quantization on the robustness of the network, or using only empirical techniques for improving robustness.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24214v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24214v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24214v1-abstract-full" style="display: none;"> Mixed precision quantization has become an important technique for enabling the execution of deep neural networks (DNNs) on limited resource computing platforms. Traditional quantization methods have primarily concentrated on maintaining neural network accuracy, either ignoring the impact of quantization on the robustness of the network, or using only empirical techniques for improving robustness. In contrast, techniques for robustness certification, which can provide strong guarantees about the robustness of DNNs have not been used during quantization due to their high computation cost. This paper introduces ARQ, an innovative mixed-precision quantization method that not only preserves the clean accuracy of the smoothed classifiers but also maintains their certified robustness. ARQ uses reinforcement learning to find accurate and robust DNN quantization, while efficiently leveraging randomized smoothing, a popular class of statistical DNN verification algorithms, to guide the search process. We compare ARQ with multiple state-of-the-art quantization techniques on several DNN architectures commonly used in quantization studies: ResNet-20 on CIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. 
We demonstrate that ARQ consistently performs better than these baselines across all the benchmarks and the input perturbation levels. In many cases, the performance of ARQ-quantized networks can reach that of the original DNN with floating-point weights, but with only 1.5% of the instructions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24214v1-abstract-full').style.display = 'none'; document.getElementById('2410.24214v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24036">arXiv:2410.24036</a> <span> [<a href="https://arxiv.org/pdf/2410.24036">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> The Communal Loom: Integrating Tangible Interaction and Participatory Data Collection for Assessing Well-Being </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Parikh%2C+N">Niti Parikh</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yiran Zhao</a>, <a href="/search/cs?searchtype=author&query=Alinea-Bravo%2C+M">Maria Alinea-Bravo</a>, <a href="/search/cs?searchtype=author&query=Parikh%2C+T">Tapan Parikh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24036v1-abstract-short" style="display: inline;"> For most health or well-being interventions, the process of evaluation is distinct from the activity itself, both in terms of who is involved, and how the actual data is collected and analyzed. Tangible interaction affords the opportunity to combine direct and embodied collaboration with a holistic approach to data collection and evaluation. We demonstrate this potential by describing our experien… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24036v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24036v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24036v1-abstract-full" style="display: none;"> For most health or well-being interventions, the process of evaluation is distinct from the activity itself, both in terms of who is involved, and how the actual data is collected and analyzed. Tangible interaction affords the opportunity to combine direct and embodied collaboration with a holistic approach to data collection and evaluation. We demonstrate this potential by describing our experiences designing and using the Communal Loom, an artifact for art therapy that translates quantitative data to collectively woven artifacts. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24036v1-abstract-full').style.display = 'none'; document.getElementById('2410.24036v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at CHI 2022 Tangible Interaction for Supporting Well-being Workshop</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23611">arXiv:2410.23611</a> <span> [<a href="https://arxiv.org/pdf/2410.23611">pdf</a>, <a href="https://arxiv.org/ps/2410.23611">ps</a>, <a href="https://arxiv.org/format/2410.23611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Focal-free uniform hypergraphs and codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xinqi Huang</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+C">Chong Shangguan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiande Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuhao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23611v1-abstract-short" style="display: inline;"> Motivated by the study of a variant of sunflowers, Alon and Holzman recently introduced focal-free hypergraphs. In this paper, we show that there is an interesting connection between the maximum size of focal-free hypergraphs and the renowned Erdős Matching Conjecture on the maximum number of edges that can be contained in a uniform hypergraph with bounded matching number. As a consequence, we giv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23611v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23611v1-abstract-full" style="display: none;"> Motivated by the study of a variant of sunflowers, Alon and Holzman recently introduced focal-free hypergraphs. In this paper, we show that there is an interesting connection between the maximum size of focal-free hypergraphs and the renowned Erdős Matching Conjecture on the maximum number of edges that can be contained in a uniform hypergraph with bounded matching number. As a consequence, we give asymptotically optimal bounds on the maximum sizes of focal-free uniform hypergraphs and codes, thereby significantly improving the previous results of Alon and Holzman. 
Moreover, by using the existence results of combinatorial designs and orthogonal arrays, we are able to explicitly determine the exact sizes of maximum focal-free uniform hypergraphs and codes for a wide range of parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23611v1-abstract-full').style.display = 'none'; document.getElementById('2410.23611v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23569">arXiv:2410.23569</a> <span> [<a href="https://arxiv.org/pdf/2410.23569">pdf</a>, <a href="https://arxiv.org/format/2410.23569">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RA-PbRL: Provably Efficient Risk-Aware Preference-Based Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yujie Zhao</a>, <a href="/search/cs?searchtype=author&query=Escamill%2C+J+E+A">Jose Efraim Aguilar Escamill</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Weyl Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huazheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23569v1-abstract-short" style="display: inline;"> Preference-based Reinforcement Learning (PbRL) studies the problem where agents receive only preferences over pairs of trajectories in each episode. Traditional approaches in this field have predominantly focused on the mean reward or utility criterion. However, in PbRL scenarios demanding heightened risk awareness, such as in AI systems, healthcare, and agriculture, risk-aware measures are requis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23569v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23569v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23569v1-abstract-full" style="display: none;"> Preference-based Reinforcement Learning (PbRL) studies the problem where agents receive only preferences over pairs of trajectories in each episode. Traditional approaches in this field have predominantly focused on the mean reward or utility criterion. However, in PbRL scenarios demanding heightened risk awareness, such as in AI systems, healthcare, and agriculture, risk-aware measures are requisite. Traditional risk-aware objectives and algorithms are not applicable in such one-episode-reward settings. To address this, we explore and prove the applicability of two risk-aware objectives to PbRL: nested and static quantile risk objectives. We also introduce Risk-Aware-PbRL (RA-PbRL), an algorithm designed to optimize both nested and static objectives. 
arXiv:2410.23537 (https://arxiv.org/abs/2410.23537) [pdf, other]
Subjects: cs.PF (Performance); cs.AI (Artificial Intelligence)
Title: ALISE: Accelerating Large Language Model Serving with Speculative Scheduling
Authors: Youpeng Zhao, Jun Wang
Abstract: Large Language Models (LLMs) represent a revolutionary advancement in the contemporary landscape of artificial general intelligence (AGI). As exemplified by ChatGPT, LLM-based applications necessitate minimal response latency and maximal throughput for inference serving. However, due to the unpredictability of LLM execution, the first-come-first-serve (FCFS) scheduling policy employed by current LLM serving systems suffers from head-of-line (HoL) blocking issues and long job response times. In this paper, we propose a new efficient LLM inference serving framework, named ALISE. The key design paradigm of ALISE is to leverage a novel speculative scheduler by estimating the execution time for each job and exploiting such prior knowledge to assign appropriate job priority orders, thus minimizing potential queuing delays for heterogeneous workloads. Furthermore, to mitigate the memory overhead of the intermediate key-value (KV) cache, we employ a priority-based adaptive memory management protocol and quantization-based compression techniques. Evaluations demonstrate that in comparison to the state-of-the-art solution vLLM, ALISE improves the throughput of inference serving by up to 1.8x and 2.1x under the same latency constraint on the Alpaca and ShareGPT datasets, respectively.
Submitted 30 October, 2024; originally announced October 2024.
Comments: ICCAD 2024
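The speculative-scheduling idea in this abstract (predict each job's execution time, then order the queue by that prediction rather than by arrival) can be shown with a toy queue. The job fields, the estimator values, and the shortest-predicted-job-first rule below are illustrative assumptions, not the ALISE scheduler itself; the point is only that predicted-time ordering avoids the head-of-line blocking FCFS suffers when a long request arrives first.

```python
from dataclasses import dataclass, field

@dataclass(order=True)
class Job:
    predicted_time: float            # speculative estimate used as the priority key
    name: str = field(compare=False)
    actual_time: float = field(compare=False)

def average_waiting_time(jobs, order):
    """Average time each job spends waiting before it starts executing."""
    clock, total_wait = 0.0, 0.0
    for job in order(jobs):
        total_wait += clock
        clock += job.actual_time
    return total_wait / len(jobs)

fcfs = lambda jobs: list(jobs)          # first-come-first-serve (arrival order)
speculative = lambda jobs: sorted(jobs)  # shortest-predicted-job-first

if __name__ == "__main__":
    # One long request arriving first blocks the short ones under FCFS.
    queue = [
        Job(predicted_time=9.0, name="long prompt", actual_time=10.0),
        Job(predicted_time=1.2, name="short chat 1", actual_time=1.0),
        Job(predicted_time=0.8, name="short chat 2", actual_time=1.0),
    ]
    print("FCFS mean wait       :", average_waiting_time(queue, fcfs))
    print("Speculative mean wait:", average_waiting_time(queue, speculative))
```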
arXiv:2410.23266 (https://arxiv.org/abs/2410.23266) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal Foundation Models
Authors: Ziyao Shangguan, Chuhan Li, Yuxuan Ding, Yanan Zheng, Yilun Zhao, Tesca Fitzgerald, Arman Cohan
Abstract: Existing benchmarks often highlight the remarkable performance achieved by state-of-the-art Multimodal Foundation Models (MFMs) in leveraging temporal context for video understanding. However, how well do the models truly perform visual temporal reasoning?
Our study of existing benchmarks shows that this capability of MFMs is likely overestimated as many questions can be solved by using a single, few, or out-of-order frames. To systematically examine current visual temporal reasoning tasks, we propose three principles with corresponding metrics: (1) Multi-Frame Gain, (2) Frame Order Sensitivity, and (3) Frame Information Disparity. Following these principles, we introduce TOMATO, Temporal Reasoning Multimodal Evaluation, a novel benchmark crafted to rigorously assess MFMs' temporal reasoning capabilities in video understanding. TOMATO comprises 1,484 carefully curated, human-annotated questions spanning six tasks (i.e., action count, direction, rotation, shape & trend, velocity & frequency, and visual cues), applied to 1,417 videos, including 805 self-recorded and -generated videos, that encompass human-centric, real-world, and simulated scenarios. Our comprehensive evaluation reveals a human-model performance gap of 57.3% with the best-performing model. Moreover, our in-depth analysis uncovers more fundamental limitations beyond this gap in current MFMs. While they can accurately recognize events in isolated frames, they fail to interpret these frames as a continuous sequence. We believe TOMATO will serve as a crucial testbed for evaluating the next-generation MFMs and as a call to the community to develop AI systems capable of comprehending human world dynamics through the video modality.
Submitted 30 October, 2024; originally announced October 2024.
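The abstract names its three metrics without giving formulas. As one hedged reading, Multi-Frame Gain can be thought of as the accuracy a model achieves with the full frame sequence minus its accuracy with a single frame; the helper below implements that reading on toy predictions. The formula and the toy data are assumptions made for illustration, not TOMATO's published definitions.

```python
def accuracy(predictions, answers):
    """Fraction of questions answered correctly."""
    return sum(p == a for p, a in zip(predictions, answers)) / len(answers)

def multi_frame_gain(preds_all_frames, preds_single_frame, answers):
    """Hypothetical reading of Multi-Frame Gain: how much accuracy the model
    loses when its temporal context is reduced to one frame. A large gain
    suggests the question genuinely requires multi-frame (temporal) reasoning."""
    return accuracy(preds_all_frames, answers) - accuracy(preds_single_frame, answers)

if __name__ == "__main__":
    answers = ["left", "3", "clockwise", "up"]
    with_full_video = ["left", "3", "clockwise", "down"]   # toy model, all frames
    with_single_frame = ["left", "1", "counter", "down"]   # same model, one frame
    print("Multi-Frame Gain:", multi_frame_gain(with_full_video, with_single_frame, answers))
```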
arXiv:2410.23239 (https://arxiv.org/abs/2410.23239) [pdf]
Subjects: cs.HC (Human-Computer Interaction); cs.CY (Computers and Society)
Title: CRAFT@Large: Building Community Through Co-Making
Authors: Yiran Zhao, Maria Alinea-Bravo, Niti Parikh
Abstract: CRAFT@Large (C@L) is an initiative launched by the MakerLAB at Cornell Tech to create an inclusive environment for the intercultural and intergenerational exchange of ideas through making. With our approach, we challenge the traditional definition of community outreach performed by academic makerspaces. Existing academic makerspaces often perform community engagement by only offering hourly, one-time workshops or by having community members provide a problem that is then used by students as a project assignment. These approaches position community members as occasional visitors and non-equal contributors, which not only conflict with the core values of co-creation but also limit the makerspaces' impact on connecting the universities and the communities. C@L explored an alternative approach in which we invited community members as long-term and equal co-makers into the academic makerspaces. In this article, we showcase two sets of collaborations that illustrate the continuity of people through co-making. We present how academic makerspaces can function as a hub that connects community members and partner organizations with the campus community in a long-term relationship.
Submitted 30 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> K.4.3 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Symposium on Academic Makerspaces. 6 (2021) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23077">arXiv:2410.23077</a> <span> [<a href="https://arxiv.org/pdf/2410.23077">pdf</a>, <a href="https://arxiv.org/format/2410.23077">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ Spatiotemporal Agent Detection 2024 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tengfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruyang Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Q">Qi Deng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yaqian Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rengang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23077v1-abstract-short" style="display: inline;"> This report presents our team's solutions for the Track 1 of the 2024 ECCV ROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which aims to construct an "agent tube" for road agents in consecutive video frames. Our solutions focus on the challenges in this task, including extreme-size objects, low-light scenarios, class imbalance, and fine-grained classification. Firstly, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23077v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23077v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23077v1-abstract-full" style="display: none;"> This report presents our team's solutions for the Track 1 of the 2024 ECCV ROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which aims to construct an "agent tube" for road agents in consecutive video frames. Our solutions focus on the challenges in this task, including extreme-size objects, low-light scenarios, class imbalance, and fine-grained classification. Firstly, the extreme-size object detection heads are introduced to improve the detection performance of large and small objects. Secondly, we design a dual-stream detection model with a low-light enhancement stream to improve the performance of spatiotemporal agent detection in low-light scenes, and the feature fusion module to integrate features from different branches. Subsequently, we develop a multi-branch detection framework to mitigate the issues of class imbalance and fine-grained classification, and we design a pre-training and fine-tuning approach to optimize the above multi-branch framework. Besides, we employ some common data augmentation techniques, and improve the loss function and upsampling operation. 
We rank first in the test set of Track 1 for the ROAD++ Challenge 2024 and achieve an average video-mAP of 30.82%.
Submitted 30 October, 2024; originally announced October 2024.

arXiv:2410.22981 (https://arxiv.org/abs/2410.22981) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: DisenTS: Disentangled Channel Evolving Pattern Modeling for Multivariate Time Series Forecasting
Authors: Zhiding Liu, Jiqian Yang, Qingyang Mao, Yuze Zhao, Mingyue Cheng, Zhi Li, Qi Liu, Enhong Chen
Abstract: Multivariate time series forecasting plays a crucial role in various real-world applications. Significant efforts have been made to integrate advanced network architectures and training strategies that enhance the capture of temporal dependencies, thereby improving forecasting accuracy. On the other hand, mainstream approaches typically utilize a single unified model with simplistic channel-mixing embedding or cross-channel attention operations to account for the critical intricate inter-channel dependencies. Moreover, some methods even trade capacity for robust prediction based on the channel-independent assumption.
Nonetheless, as time series data may display distinct evolving patterns due to the unique characteristics of each channel (including multiple strong seasonalities and trend changes), the unified modeling methods could yield suboptimal results. To this end, we propose DisenTS, a tailored framework for modeling disentangled channel evolving patterns in general multivariate time series forecasting. The central idea of DisenTS is to model the potential diverse patterns within the multivariate time series data in a decoupled manner. Technically, the framework employs multiple distinct forecasting models, each tasked with uncovering a unique evolving pattern. To guide the learning process without supervision of pattern partition, we introduce a novel Forecaster Aware Gate (FAG) module that generates the routing signals adaptively according to both the forecasters' states and input series' characteristics. The forecasters' states are derived from the Linear Weight Approximation (LWA) strategy, which quantizes the complex deep neural networks into compact matrices. Additionally, the Similarity Constraint (SC) is further proposed to guide each model to specialize in an underlying pattern by minimizing the mutual information between the representations.
Submitted 30 October, 2024; originally announced October 2024.
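The routing mechanism sketched in this abstract (several specialized forecasters whose predictions are mixed by a gate that looks at the input series) can be approximated by a tiny mixture of linear predictors with a softmax gate. The summary features, fixed weights, and gate parameters below are placeholders; the learned FAG, LWA, and SC components of DisenTS are not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

class LinearForecaster:
    """Toy per-pattern forecaster: predicts the next value as a fixed weighted
    sum of the last `window` observations (weights random here, learned in practice)."""
    def __init__(self, window):
        self.weights = rng.normal(size=window) / window
    def predict(self, history):
        return float(self.weights @ history[-len(self.weights):])

def gated_forecast(history, forecasters, gate_matrix):
    """Mix the forecasters' outputs with softmax routing weights computed from
    simple summary features of the input series (a stand-in for the gate)."""
    feats = np.array([history.mean(), history.std(), history[-1] - history[0]])
    weights = softmax(gate_matrix @ feats)
    preds = np.array([f.predict(history) for f in forecasters])
    return float(weights @ preds), weights

if __name__ == "__main__":
    window = 8
    forecasters = [LinearForecaster(window) for _ in range(3)]  # one per assumed pattern
    gate_matrix = rng.normal(size=(3, 3))                       # toy gate parameters
    series = np.sin(np.linspace(0, 4 * np.pi, 64)) + 0.1 * rng.normal(size=64)
    pred, w = gated_forecast(series, forecasters, gate_matrix)
    print("routing weights:", np.round(w, 3), "prediction:", round(pred, 3))
```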
arXiv:2410.22821 (https://arxiv.org/abs/2410.22821) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SE (Software Engineering)
Title: EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations
Authors: Jia Li, Ge Li, Xuanming Zhang, Yunfei Zhao, Yihong Dong, Zhi Jin, Binhua Li, Fei Huang, Yongbin Li
Abstract: How to evaluate Large Language Models (LLMs) in code generation remains an open question. Existing benchmarks have two limitations - data leakage and lack of domain-specific evaluation. The former hurts the fairness of benchmarks, and the latter hinders practitioners from selecting superior LLMs for specific programming domains. To address these two limitations, we propose a new benchmark - EvoCodeBench, which has the following advances: (1) Evolving data. EvoCodeBench will be dynamically updated every period (e.g., 6 months) to avoid data leakage. This paper releases the first version - EvoCodeBench-2403, containing 275 samples from 25 repositories. (2) A domain taxonomy and domain labels. Based on the statistics of open-source communities, we design a programming domain taxonomy consisting of 10 popular domains. Based on the taxonomy, we annotate each sample in EvoCodeBench with a domain label. (3) Domain-specific evaluations. Besides the Pass@k, we compute the Domain-Specific Improvement (DSI) and define LLMs' comfort and strange domains. These evaluations help practitioners select superior LLMs in specific domains and discover the shortcomings of existing LLMs. We evaluate 8 popular LLMs (e.g., gpt-4, DeepSeek Coder) on EvoCodeBench and summarize some insights. EvoCodeBench reveals the actual abilities of these LLMs in real-world repositories. For example, the highest Pass@1 of gpt-4 on EvoCodeBench-2403 is only 20.74%. Besides, we evaluate LLMs in different domains and discover their comfort and strange domains. For example, gpt-4 performs best in most domains but falls behind others in the Internet domain. StarCoder 2-15B unexpectedly performs well in the Database domain and even outperforms 33B LLMs. EvoCodeBench has been released.
Submitted 30 October, 2024; originally announced October 2024.
Comments: Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)
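Pass@k, which EvoCodeBench reports alongside its domain-specific metrics, is commonly computed with the unbiased estimator of Chen et al. (2021): with n sampled completions of which c pass the tests, pass@k = 1 - C(n-c, k) / C(n, k). The snippet below implements that standard estimator; whether EvoCodeBench uses exactly this formulation is an assumption.

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: probability that at least one of k samples,
    drawn without replacement from n generated completions of which c are
    correct, passes the tests."""
    if n - c < k:  # fewer than k incorrect samples: every k-subset contains a correct one
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

if __name__ == "__main__":
    # e.g. 20 completions per task, 4 of which pass the unit tests
    print(round(pass_at_k(n=20, c=4, k=1), 4))  # 0.2, matches c/n for k=1
    print(round(pass_at_k(n=20, c=4, k=5), 4))
```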
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21345">arXiv:2410.21345</a> <span> [<a href="https://arxiv.org/pdf/2410.21345">pdf</a>, <a href="https://arxiv.org/format/2410.21345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Absorb & Escape: Overcoming Single Model Limitations in Generating Genomic Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zehui Li</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+Y">Yuhao Ni</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+G">Guoxuan Xia</a>, <a href="/search/cs?searchtype=author&query=Beardall%2C+W">William Beardall</a>, <a href="/search/cs?searchtype=author&query=Das%2C+A">Akashaditya Das</a>, <a href="/search/cs?searchtype=author&query=Stan%2C+G">Guy-Bart Stan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yiren Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21345v1-abstract-short" style="display: inline;"> Abstract Recent advances in immunology and synthetic biology have accelerated the development of deep generative methods for DNA sequence design. Two dominant approaches in this field are AutoRegressive (AR) models and Diffusion Models (DMs). However, genomic sequences are functionally heterogeneous, consisting of multiple connected regions (e.g., Promoter Regions, Exons, and Introns) where elemen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21345v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21345v1-abstract-full" style="display: none;"> Abstract Recent advances in immunology and synthetic biology have accelerated the development of deep generative methods for DNA sequence design. Two dominant approaches in this field are AutoRegressive (AR) models and Diffusion Models (DMs). However, genomic sequences are functionally heterogeneous, consisting of multiple connected regions (e.g., Promoter Regions, Exons, and Introns) where elements within each region come from the same probability distribution, but the overall sequence is non-homogeneous. This heterogeneous nature presents challenges for a single model to accurately generate genomic sequences. In this paper, we analyze the properties of AR models and DMs in heterogeneous genomic sequence generation, pointing out crucial limitations in both methods: (i) AR models capture the underlying distribution of data by factorizing and learning the transition probability but fail to capture the global property of DNA sequences. 
arXiv:2410.21340 (https://arxiv.org/abs/2410.21340) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.DC (Distributed, Parallel, and Cluster Computing)
Title: Meta-Learning for Speeding Up Large Model Inference in Decentralized Environments
Authors: Yuzhe Yang, Yipeng Du, Ahmad Farhan, Claudio Angione, Yue Zhao, Harry Yang, Fielding Johnston, James Buban, Patrick Colangelo
Abstract: The deployment of large-scale models, such as large language models (LLMs) and sophisticated image generation systems, incurs substantial costs due to their computational demands. To mitigate these costs and address challenges related to scalability and data security, there is a growing shift towards decentralized systems for deploying such models. In these decentralized environments, efficient inference acceleration becomes crucial to manage computational resources effectively and enhance system responsiveness. In this work, we address the challenge of selecting optimal acceleration methods in decentralized systems by introducing a meta-learning-based framework. This framework automates the selection process by learning from historical performance data of various acceleration techniques across different tasks. Unlike traditional methods that rely on random selection or expert intuition, our approach systematically identifies the best acceleration strategies based on the specific characteristics of each task. We demonstrate that our meta-learning framework not only streamlines the decision-making process but also consistently outperforms conventional methods in terms of efficiency and performance. Our results highlight the potential of meta-learning to revolutionize inference acceleration in decentralized AI systems, offering a path towards more democratic and economically feasible artificial intelligence solutions.
Submitted 28 October, 2024; originally announced October 2024.
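The selection loop described here (learn from historical performance of acceleration techniques across tasks, then choose a method from a new task's characteristics) can be caricatured as a nearest-neighbour lookup over task feature vectors. The features, method names, and history entries below are invented for illustration; the paper's meta-learner is not reproduced.

```python
import numpy as np

# Hypothetical history: (task feature vector, best acceleration method observed).
# Features here are (model size in billions of parameters, batch size, sequence length / 1k).
HISTORY = [
    (np.array([ 7.0,  1.0, 0.5]), "speculative_decoding"),
    (np.array([70.0,  1.0, 2.0]), "int4_quantization"),
    (np.array([13.0, 32.0, 1.0]), "continuous_batching"),
    (np.array([ 7.0, 64.0, 0.5]), "continuous_batching"),
]

def select_acceleration(task_features: np.ndarray) -> str:
    """Pick the acceleration method of the most similar historical task
    (a 1-nearest-neighbour stand-in for the meta-learned selector)."""
    distances = [np.linalg.norm(task_features - feats) for feats, _ in HISTORY]
    return HISTORY[int(np.argmin(distances))][1]

if __name__ == "__main__":
    new_task = np.array([65.0, 2.0, 1.5])  # a large model served with a small batch
    print(select_acceleration(new_task))    # -> "int4_quantization"
```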
arXiv:2410.21290 (https://arxiv.org/abs/2410.21290) [pdf, other]
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Title: Multiple Ships Cooperative Navigation and Collision Avoidance using Multi-agent Reinforcement Learning with Communication
Authors: Y. Wang, Y. Zhao
Abstract: In the real world, unmanned surface vehicles (USV) often need to coordinate with each other to accomplish specific tasks.
However, achieving cooperative control in multi-agent systems is challenging due to issues such as non-stationarity and partial observability. Recent advancements in Multi-Agent Reinforcement Learning (MARL) provide new perspectives to address these challenges. Therefore, we propose using the multi-agent deep deterministic policy gradient (MADDPG) algorithm with communication to address multiple ships' cooperation problems under partial observability. We developed two tasks based on OpenAI's gym environment: cooperative navigation and cooperative collision avoidance. In these tasks, ships must not only learn effective control strategies but also establish communication protocols with other agents. We analyze the impact of external noise on communication, the effect of inter-agent communication on performance, and the communication patterns learned by the agents. The results demonstrate that our proposed framework effectively addresses cooperative navigation and collision avoidance among multiple vessels, significantly outperforming traditional single-agent algorithms. Agents establish a consistent communication protocol, enabling them to compensate for missing information through shared observations and achieve better coordination.
Submitted 12 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 4 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>