
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,766 results for author: <span class="mathjax">Yang, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yang%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18428">arXiv:2411.18428</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18428">pdf</a>, <a href="https://arxiv.org/format/2411.18428">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MM-Path: Multi-modal, Multi-granularity Path Representation Learning -- Extended Version </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Ronghui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hanyin Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chenjuan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Hongfan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jilin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S+B">Sean Bin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18428v1-abstract-short" style="display: inline;"> Developing effective path representations has become increasingly essential across various fields within intelligent transportation. 
Abstract: Developing effective path representations has become increasingly essential across various fields within intelligent transportation. Although pre-trained path representation learning models have shown improved performance, they predominantly focus on the topological structures from single modality data, i.e., road networks, overlooking the geometric and contextual features associated with path-related images, e.g., remote sensing images. Similar to human understanding, integrating information from multiple modalities can provide a more comprehensive view, enhancing both representation accuracy and generalization. However, variations in information granularity impede the semantic alignment of road network-based paths (road paths) and image-based paths (image paths), while the heterogeneity of multi-modal data poses substantial challenges for effective fusion and utilization. In this paper, we propose a novel Multi-modal, Multi-granularity Path Representation Learning Framework (MM-Path), which can learn a generic path representation by integrating modalities from both road paths and image paths. To enhance the alignment of multi-modal data, we develop a multi-granularity alignment strategy that systematically associates nodes, road sub-paths, and road paths with their corresponding image patches, ensuring the synchronization of both detailed local information and broader global contexts. To address the heterogeneity of multi-modal data effectively, we introduce a graph-based cross-modal residual fusion component designed to comprehensively fuse information across different modalities and granularities. Finally, we conduct extensive experiments on two large-scale real-world datasets under two downstream tasks, validating the effectiveness of the proposed MM-Path. This is an extended version of the paper accepted by KDD 2025.
Submitted 27 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18203">arXiv:2411.18203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18203">pdf</a>, <a href="https://arxiv.org/format/2411.18203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Critic-V: VLM Critics Help Catch VLM Errors in Multimodal Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+J">Jingdi Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junxian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xunzhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yujie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zonglin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiatong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weida Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Suorong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jianbo Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+P">Peng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Dongzhan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18203v1-abstract-short" style="display: inline;"> Vision-language models~(VLMs) have shown remarkable advancements in multimodal reasoning tasks. However, they still often generate inaccurate or irrelevant responses due to issues like hallucinated image understandings or unrefined reasoning paths. To address these challenges, we introduce Critic-V, a novel framework inspired by the Actor-Critic paradigm to boost the reasoning capability of VLMs.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18203v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18203v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18203v1-abstract-full" style="display: none;"> Vision-language models~(VLMs) have shown remarkable advancements in multimodal reasoning tasks. However, they still often generate inaccurate or irrelevant responses due to issues like hallucinated image understandings or unrefined reasoning paths. To address these challenges, we introduce Critic-V, a novel framework inspired by the Actor-Critic paradigm to boost the reasoning capability of VLMs. This framework decouples the reasoning process and critic process by integrating two independent components: the Reasoner, which generates reasoning paths based on visual and textual inputs, and the Critic, which provides constructive critique to refine these paths. In this approach, the Reasoner generates reasoning responses according to text prompts, which can evolve iteratively as a policy based on feedback from the Critic. 
This interaction process is theoretically driven by a reinforcement learning framework in which the Critic offers natural language critiques instead of scalar rewards, enabling more nuanced feedback to boost the Reasoner's capability on complex reasoning tasks. The Critic model is trained using Direct Preference Optimization (DPO), leveraging a preference dataset of critiques ranked by Rule-based Reward (RBR) to enhance its critic capabilities. Evaluation results show that the Critic-V framework significantly outperforms existing methods, including GPT-4V, on 5 out of 8 benchmarks, especially regarding reasoning accuracy and efficiency. Combining a dynamic text-based policy for the Reasoner and constructive feedback from the preference-optimized Critic enables a more reliable and context-sensitive multimodal reasoning process. Our approach provides a promising solution to enhance the reliability of VLMs, improving their performance in real-world reasoning-heavy multimodal applications such as autonomous driving and embodied intelligence.
Submitted 27 November, 2024; originally announced November 2024.
Comments: 16 pages, 11 figures
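The Critic-V abstract above trains the Critic with Direct Preference Optimization on critiques ranked by a rule-based reward. A minimal sketch of the standard DPO objective on per-sequence log-probabilities follows; the tensors and beta value are illustrative assumptions, not the authors' implementation.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # Standard DPO objective: prefer the chosen critique over the rejected one,
    # measured relative to a frozen reference model.
    pi_logratios = policy_chosen_logps - policy_rejected_logps
    ref_logratios = ref_chosen_logps - ref_rejected_logps
    return -F.logsigmoid(beta * (pi_logratios - ref_logratios)).mean()

# Dummy per-sequence log-probabilities for a batch of three preference pairs.
loss = dpo_loss(torch.tensor([-12.0, -9.5, -14.0]), torch.tensor([-13.0, -9.0, -15.5]),
                torch.tensor([-12.5, -9.4, -14.2]), torch.tensor([-12.8, -9.1, -15.0]))
print(float(loss))
```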

3. arXiv:2411.17776 [pdf, other] cs.CV cs.MM
Beyond Walking: A Large-Scale Image-Text Benchmark for Text-based Person Anomaly Search
Authors: Shuyu Yang, Yaxiong Wang, Li Zhu, Zhedong Zheng
Abstract: Text-based person search aims to retrieve specific individuals across camera networks using natural language descriptions. However, current benchmarks often exhibit biases towards common actions like walking or standing, neglecting the critical need for identifying abnormal behaviors in real-world scenarios. To meet such demands, we propose a new task, text-based person anomaly search, locating pedestrians engaged in both routine and anomalous activities via text. To enable the training and evaluation of this new task, we construct a large-scale image-text Pedestrian Anomaly Behavior (PAB) benchmark, featuring a broad spectrum of actions, e.g., running, performing, playing soccer, and the corresponding anomalies, e.g., lying, being hit, and falling of the same identity. The training set of PAB comprises 1,013,605 synthesized image-text pairs of both normalities and anomalies, while the test set includes 1,978 real-world image-text pairs. To validate the potential of PAB, we introduce a cross-modal pose-aware framework, which integrates human pose patterns with identity-based hard negative pair sampling. Extensive experiments on the proposed benchmark show that synthetic training data facilitates fine-grained behavior retrieval in the real-world test set, while the proposed pose-aware method further improves recall@1 by 2.88%. We will release the dataset, code, and checkpoints to facilitate further research and ensure the reproducibility of our results.
Submitted 26 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17766">arXiv:2411.17766</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17766">pdf</a>, <a href="https://arxiv.org/format/2411.17766">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Integrating Dual Prototypes for Task-Wise Adaption in Pre-Trained Model-Based Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiming Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Suorong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Baile Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+F">Furao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17766v1-abstract-short" style="display: inline;"> Class-incremental learning (CIL) aims to acquire new classes while conserving historical knowledge incrementally. Despite existing pre-trained model (PTM) based methods performing excellently in CIL, it is better to fine-tune them on downstream incremental tasks with massive patterns unknown to PTMs. However, using task streams for fine-tuning could lead to catastrophic forgetting that will erase&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17766v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17766v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17766v1-abstract-full" style="display: none;"> Class-incremental learning (CIL) aims to acquire new classes while conserving historical knowledge incrementally. Despite existing pre-trained model (PTM) based methods performing excellently in CIL, it is better to fine-tune them on downstream incremental tasks with massive patterns unknown to PTMs. However, using task streams for fine-tuning could lead to catastrophic forgetting that will erase the knowledge in PTMs. This paper proposes the Dual Prototype network for Task-wise Adaption (DPTA) of PTM-based CIL. For each incremental learning task, a task-wise adapter module is built to fine-tune the PTM, where the center-adapt loss forces the representation to be more centrally clustered and class separable. The dual prototype network improves the prediction process by enabling test-time adapter selection, where the raw prototypes deduce several possible task indexes of test samples to select suitable adapter modules for PTM, and the augmented prototypes that could separate highly correlated classes are utilized to determine the final result. Experiments on several benchmark datasets demonstrate the state-of-the-art performance of DPTA. The code will be open-sourced after the paper is published. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17766v1-abstract-full').style.display = 'none'; document.getElementById('2411.17766v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages,6 figures,2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17413">arXiv:2411.17413</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17413">pdf</a>, <a href="https://arxiv.org/ps/2411.17413">ps</a>, <a href="https://arxiv.org/format/2411.17413">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Overhead of the Performance Profiler Cloudprofiler With MooBench </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shinhyung Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Reichelt%2C+D+G">David Georg Reichelt</a>, <a href="/search/cs?searchtype=author&amp;query=Hasselbring%2C+W">Wilhelm Hasselbring</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17413v1-abstract-short" style="display: inline;"> Performance engineering has become crucial for the cloud-native architecture. This architecture deploys multiple services, with each service representing an orchestration of containerized processes. OpenTelemetry is growing popular in the cloud-native industry for observing the software&#39;s behaviour, and Kieker provides the necessary tools to monitor and analyze the performance of target architectu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17413v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17413v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17413v1-abstract-full" style="display: none;"> Performance engineering has become crucial for the cloud-native architecture. This architecture deploys multiple services, with each service representing an orchestration of containerized processes. OpenTelemetry is growing popular in the cloud-native industry for observing the software&#39;s behaviour, and Kieker provides the necessary tools to monitor and analyze the performance of target architectures. Observability overhead is an important aspect of performance engineering and MooBench is designed to compare different observability frameworks, including OpenTelemetry and Kieker. In this work, we measure the overhead of Cloudprofiler, a performance profiler implemented in C++ to measure native and JVM processes. 
It minimizes the profiling overhead by locating the profiler process outside the target process and by moving the disk-writing overhead off the critical path with buffer blocks and compression threads. Using MooBench, Cloudprofiler's buffered ID handler with the Zstandard lossless data compression (ZSTD) showed an average execution time of 2.28 microseconds, 6.15 times faster than the non-buffered, non-compressing handler.
Submitted 26 November, 2024; originally announced November 2024.
ACM Class: D.2.5; D.2.8; D.2.9
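The Cloudprofiler entry above moves disk writing off the critical path with buffer blocks and compression threads. Below is a minimal, hypothetical Python sketch of that pattern using the zstandard package: the hot path only enqueues buffer blocks, while a background thread compresses and writes them. Queue size, compression level, and file name are illustrative assumptions; the actual profiler is implemented in C++.

```python
import queue
import threading
import zstandard as zstd  # pip install zstandard

buf_q: "queue.Queue" = queue.Queue(maxsize=64)  # bounded hand-off queue

def writer(path: str) -> None:
    # Background thread: compress each buffer block and append it to disk.
    cctx = zstd.ZstdCompressor(level=3)
    with open(path, "ab") as f:
        while True:
            block = buf_q.get()
            if block is None:      # sentinel: stop the writer
                break
            f.write(cctx.compress(block))

t = threading.Thread(target=writer, args=("profile.zst",), daemon=True)
t.start()

# Hot path: the instrumented code only fills a buffer block and enqueues it;
# compression and file I/O happen off the critical path.
buf_q.put(b"event_id=42;ts=123456789\n" * 1000)
buf_q.put(None)
t.join()
```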

6. arXiv:2411.17339 [pdf, other] cs.NE cs.AI cs.LG
Knowledge-aware Evolutionary Graph Neural Architecture Search
Authors: Chao Wang, Jiaxuan Zhao, Lingling Li, Licheng Jiao, Fang Liu, Xu Liu, Shuyuan Yang
Abstract: Graph neural architecture search (GNAS) can customize high-performance graph neural network architectures for specific graph tasks or datasets. However, existing GNAS methods begin searching for architectures from a zero-knowledge state, ignoring prior knowledge that may improve search efficiency. The available knowledge base (e.g., NAS-Bench-Graph) contains many rich architectures and their multiple performance metrics, such as accuracy (#Acc) and number of parameters (#Params). This study proposes exploiting such prior knowledge to accelerate the multi-objective evolutionary search on a new graph dataset, named knowledge-aware evolutionary GNAS (KEGNAS). KEGNAS employs the knowledge base to train a knowledge model and a deep multi-output Gaussian process (DMOGP) in one go, which generates and evaluates transfer architectures in only a few GPU seconds. The knowledge model first establishes a dataset-to-architecture mapping, which can quickly generate candidate transfer architectures for a new dataset. Subsequently, the DMOGP with architecture and dataset encodings is designed to predict multiple performance metrics for candidate transfer architectures on the new dataset. According to the predicted metrics, non-dominated candidate transfer architectures are selected to warm-start the multi-objective evolutionary algorithm for optimizing the #Acc and #Params on a new dataset. Empirical studies on NAS-Bench-Graph and five real-world datasets show that KEGNAS swiftly generates top-performing architectures, achieving 4.27% higher accuracy than advanced evolutionary baselines and 11.54% higher accuracy than advanced differentiable baselines. In addition, ablation studies demonstrate that the use of prior knowledge significantly improves search performance.
Submitted 26 November, 2024; originally announced November 2024.
Comments: This work has been accepted by Knowledge-Based Systems
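The KEGNAS abstract above selects non-dominated candidate transfer architectures (by predicted #Acc and #Params) to warm-start the evolutionary search. The snippet below is only a generic non-dominated filter over two objectives with hypothetical candidates; it is not the paper's DMOGP or search code.

```python
def non_dominated(candidates):
    """candidates: list of (name, predicted_acc, n_params); keep the Pareto front
    for maximizing accuracy while minimizing parameter count."""
    front = []
    for name, acc, params in candidates:
        dominated = any(
            acc2 >= acc and params2 <= params and (acc2 > acc or params2 < params)
            for _, acc2, params2 in candidates
        )
        if not dominated:
            front.append((name, acc, params))
    return front

# Hypothetical predictions from a surrogate model.
preds = [("arch_a", 0.81, 1.2e6), ("arch_b", 0.79, 0.6e6), ("arch_c", 0.78, 0.9e6)]
print(non_dominated(preds))  # arch_c is dominated by arch_b; arch_a and arch_b remain
```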
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been accepted by Knowledge-Based Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17125">arXiv:2411.17125</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17125">pdf</a>, <a href="https://arxiv.org/format/2411.17125">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DOGE: Towards Versatile Visual Document Grounding and Referring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yinan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haokun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuyu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Li Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Z">Zhongang Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chen Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Y">Ying Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17125v1-abstract-short" style="display: inline;"> In recent years, Multimodal Large Language Models (MLLMs) have increasingly emphasized grounding and referring capabilities to achieve detailed understanding and flexible user interaction. However, in the realm of visual document understanding, these capabilities lag behind due to the scarcity of fine-grained datasets and comprehensive benchmarks. To fill this gap, we propose the DOcument Groundin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17125v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17125v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17125v1-abstract-full" style="display: none;"> In recent years, Multimodal Large Language Models (MLLMs) have increasingly emphasized grounding and referring capabilities to achieve detailed understanding and flexible user interaction. However, in the realm of visual document understanding, these capabilities lag behind due to the scarcity of fine-grained datasets and comprehensive benchmarks. To fill this gap, we propose the DOcument Grounding and Eferring data engine (DOGE-Engine), which produces two types of high-quality fine-grained document data: multi-granular parsing data for enhancing fundamental text localization and recognition capabilities; and instruction-tuning data to activate MLLM&#39;s grounding and referring capabilities during dialogue and reasoning. Additionally, using our engine, we construct DOGE-Bench, which encompasses 7 grounding and referring tasks across 3 document types (chart, poster, PDF document), providing comprehensive evaluations for fine-grained document understanding. 
Furthermore, leveraging the data generated by our engine, we develop a strong baseline model, DOGE. This pioneering MLLM is capable of accurately referring and grounding texts at multiple granularities within document images. Our code, data, and model will be open-sourced for community development.
Submitted 26 November, 2024; originally announced November 2024.
Comments: 20 pages, 13 figures

8. arXiv:2411.17058 [pdf, other] cs.CR cs.AI
ThreatModeling-LLM: Automating Threat Modeling using Large Language Models for Banking System
Authors: Shuiqiao Yang, Tingmin Wu, Shigang Liu, David Nguyen, Seung Jang, Alsharif Abuadbba
Abstract: Threat modeling is a crucial component of cybersecurity, particularly for industries such as banking, where the security of financial data is paramount. Traditional threat modeling approaches require expert intervention and manual effort, often leading to inefficiencies and human error.
The advent of Large Language Models (LLMs) offers a promising avenue for automating these processes, enhancing both efficiency and efficacy. However, this transition is not straightforward due to three main challenges: (1) the lack of publicly available, domain-specific datasets, (2) the need for tailored models to handle complex banking system architectures, and (3) the requirement for real-time, adaptive mitigation strategies that align with compliance standards like NIST 800-53. In this paper, we introduce ThreatModeling-LLM, a novel and adaptable framework that automates threat modeling for banking systems using LLMs. ThreatModeling-LLM operates in three stages: 1) dataset creation, 2) prompt engineering, and 3) model fine-tuning. We first generate a benchmark dataset using the Microsoft Threat Modeling Tool (TMT). Then, we apply Chain of Thought (CoT) and Optimization by PROmpting (OPRO) on the pre-trained LLMs to optimize the initial prompt. Lastly, we fine-tune the LLM using Low-Rank Adaptation (LoRA) based on the benchmark dataset and the optimized prompt to improve the threat identification and mitigation generation capabilities of pre-trained LLMs.
Submitted 25 November, 2024; originally announced November 2024.
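The ThreatModeling-LLM entry above fine-tunes a pre-trained LLM with Low-Rank Adaptation (LoRA). A minimal sketch using the Hugging Face peft library follows; the base model name, rank, and target modules are illustrative assumptions rather than the paper's configuration.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "meta-llama/Llama-2-7b-hf"         # hypothetical base model, not from the paper
model = AutoModelForCausalLM.from_pretrained(base)
tokenizer = AutoTokenizer.from_pretrained(base)

lora_cfg = LoraConfig(
    r=8,                                  # low-rank update dimension
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # attention projections; model-dependent
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()        # only the adapter weights are trainable
# Supervised fine-tuning on the benchmark dataset with the optimized prompt would follow here.
```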

9. arXiv:2411.16679 [pdf, other] cs.CL
Do Large Language Models Perform Latent Multi-Hop Reasoning without Exploiting Shortcuts?
Authors: Sohee Yang, Nora Kassner, Elena Gribovskaya, Sebastian Riedel, Mor Geva
Abstract: We evaluate how well Large Language Models (LLMs) latently recall and compose facts to answer multi-hop queries like "In the year Scarlett Johansson was born, the Summer Olympics were hosted in the country of". One major challenge in evaluating this ability is that LLMs may have developed shortcuts from encountering the head entity "Scarlett Johansson" and the answer entity "United States" in the same training sequences, or may merely guess the answer based on frequency-based priors. To prevent shortcuts, we exclude test queries where the head and answer entities co-appear in pretraining corpora. Through careful selection of relations and facts and systematic removal of cases where models might guess answers or exploit partial matches, we construct an evaluation dataset SOCRATES (ShOrtCut-fRee lATent rEaSoning). We observe that LLMs demonstrate promising latent multi-hop reasoning abilities without exploiting shortcuts, but only for certain types of queries. For queries requiring latent recall of countries as the intermediate answer, the best models achieve 80% latent composability, but this drops to just 5% for the recall of years. Comparisons with Chain-of-Thought composability highlight a significant gap between the ability of models to reason latently versus explicitly. Analysis reveals that latent representations of the intermediate answer are constructed more often in queries with higher latent composability, and shows the emergence of latent multi-hop reasoning during pretraining.
Submitted 25 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16102">arXiv:2411.16102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16102">pdf</a>, <a href="https://arxiv.org/format/2411.16102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BlendServe: Optimizing Offline Inference for Auto-regressive Large Models with Resource-aware Batching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yilong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Lianmin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Kasikci%2C+B">Baris Kasikci</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+J">Jiarong Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16102v1-abstract-short" style="display: inline;"> Offline batch inference, which leverages the flexibility of request batching to achieve higher throughput and lower costs, is becoming more popular for latency-insensitive applications. Meanwhile, recent progress in model capability and modality makes requests more diverse in compute and memory demands, creating unique opportunities for throughput improvement by resource overlapping. However, a re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16102v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16102v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16102v1-abstract-full" style="display: none;"> Offline batch inference, which leverages the flexibility of request batching to achieve higher throughput and lower costs, is becoming more popular for latency-insensitive applications. Meanwhile, recent progress in model capability and modality makes requests more diverse in compute and memory demands, creating unique opportunities for throughput improvement by resource overlapping. However, a request schedule that maximizes resource overlapping can conflict with the schedule that maximizes prefix sharing, a widely-used performance optimization, causing sub-optimal inference throughput. We present BlendServe, a system that maximizes resource utilization of offline batch inference by combining the benefits of resource overlapping and prefix sharing using a resource-aware prefix tree. BlendServe exploits the relaxed latency requirements in offline batch inference to reorder and overlap requests with varied resource demands while ensuring high prefix sharing. We evaluate BlendServe on a variety of synthetic multi-modal workloads and show that it provides up to $1.44\times$ throughput boost compared to widely-used industry standards, vLLM and SGLang. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16102v1-abstract-full').style.display = 'none'; document.getElementById('2411.16102v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16072">arXiv:2411.16072</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16072">pdf</a>, <a href="https://arxiv.org/format/2411.16072">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Language Driven Occupancy Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhu Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+B">Bowen Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Lizhe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Runmin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Q">Qihao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+M">Maochun Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mingxia Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Si-Yuan Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Hui-Liang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16072v1-abstract-short" style="display: inline;"> We introduce LOcc, an effective and generalizable framework for open-vocabulary occupancy (OVO) prediction. Previous approaches typically supervise the networks through coarse voxel-to-text correspondences via image features as intermediates or noisy and sparse correspondences from voxel-based model-view projections. To alleviate the inaccurate supervision, we propose a semantic transitive labelin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16072v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16072v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16072v1-abstract-full" style="display: none;"> We introduce LOcc, an effective and generalizable framework for open-vocabulary occupancy (OVO) prediction. Previous approaches typically supervise the networks through coarse voxel-to-text correspondences via image features as intermediates or noisy and sparse correspondences from voxel-based model-view projections. To alleviate the inaccurate supervision, we propose a semantic transitive labeling pipeline to generate dense and finegrained 3D language occupancy ground truth. Our pipeline presents a feasible way to dig into the valuable semantic information of images, transferring text labels from images to LiDAR point clouds and utimately to voxels, to establish precise voxel-to-text correspondences. 
By replacing the original prediction head of supervised occupancy models with a geometry head for binary occupancy states and a language head for language features, LOcc effectively uses the generated language ground truth to guide the learning of 3D language volume. Through extensive experiments, we demonstrate that our semantic transitive labeling pipeline can produce more accurate pseudo-labeled ground truth, diminishing labor-intensive human annotations. Additionally, we validate LOcc across various architectures, where all models consistently outperform state-of-the-art zero-shot occupancy prediction approaches on the Occ3D-nuScenes dataset. Notably, even based on the simpler BEVDet model, with an input resolution of 256 * 704, Occ-BEVDet achieves an mIoU of 20.29, surpassing previous approaches that rely on temporal images, higher-resolution inputs, or larger backbone networks. The code for the proposed method is available at https://github.com/pkqbajng/LOcc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16072v1-abstract-full').style.display = 'none'; document.getElementById('2411.16072v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14925">arXiv:2411.14925</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14925">pdf</a>, <a href="https://arxiv.org/format/2411.14925">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Purrfessor: A Fine-tuned Multimodal LLaVA Diet Health Chatbot </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+L">Linqi Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Y">Yifan Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+C">Chuan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sijia Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+D">Dhavan Shah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14925v1-abstract-short" style="display: inline;"> This study introduces Purrfessor, an innovative AI chatbot designed to provide personalized dietary guidance through interactive, multimodal engagement. 
Leveraging the Large Language-and-Vision Assistant (LLaVA) model fine-tuned with food and nutrition data and a human-in-the-loop approach, Purrfessor integrates visual meal analysis with contextual advice to enhance user experience and engagement.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14925v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14925v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14925v1-abstract-full" style="display: none;"> This study introduces Purrfessor, an innovative AI chatbot designed to provide personalized dietary guidance through interactive, multimodal engagement. Leveraging the Large Language-and-Vision Assistant (LLaVA) model fine-tuned with food and nutrition data and a human-in-the-loop approach, Purrfessor integrates visual meal analysis with contextual advice to enhance user experience and engagement. We conducted two studies to evaluate the chatbot&#39;s performance and user experience: (a) simulation assessments and human validation were conducted to examine the performance of the fine-tuned model; (b) a 2 (Profile: Bot vs. Pet) by 3 (Model: GPT-4 vs. LLaVA vs. Fine-tuned LLaVA) experiment revealed that Purrfessor significantly enhanced users&#39; perceptions of care ($\beta = 1.59$, $p = 0.04$) and interest ($\beta = 2.26$, $p = 0.01$) compared to the GPT-4 bot. Additionally, user interviews highlighted the importance of interaction design details, emphasizing the need for responsiveness, personalization, and guidance to improve user engagement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14925v1-abstract-full').style.display = 'none'; document.getElementById('2411.14925v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14751">arXiv:2411.14751</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14751">pdf</a>, <a href="https://arxiv.org/format/2411.14751">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TopoSD: Topology-Enhanced Lane Segment Perception with SDMap Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Minyue Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Ziwei Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xiaolu Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X">Xiao Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yingying Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingdong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14751v1-abstract-short" style="display: inline;"> Recent advances in autonomous driving systems have shifted towards reducing reliance on high-definition maps (HDMaps) due to the huge costs of annotation and maintenance. Instead, researchers are focusing on online vectorized HDMap construction using on-board sensors. However, sensor-only approaches still face challenges in long-range perception due to the restricted views imposed by the mounting&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14751v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14751v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14751v1-abstract-full" style="display: none;"> Recent advances in autonomous driving systems have shifted towards reducing reliance on high-definition maps (HDMaps) due to the huge costs of annotation and maintenance. Instead, researchers are focusing on online vectorized HDMap construction using on-board sensors. However, sensor-only approaches still face challenges in long-range perception due to the restricted views imposed by the mounting angles of onboard cameras, just as human drivers also rely on bird&#39;s-eye-view navigation maps for a comprehensive understanding of road structures. To address these issues, we propose to train the perception model to &#34;see&#34; standard definition maps (SDMaps). 
We encode SDMap elements into neural spatial map representations and instance tokens, and then incorporate such complementary features as prior information to improve the bird&#39;s eye view (BEV) feature for lane geometry and topology decoding. Based on the lane segment representation framework, the model simultaneously predicts lanes, centrelines and their topology. To further enhance the ability of geometry prediction and topology reasoning, we also use a topology-guided decoder to refine the predictions by exploiting the mutual relationships between topological and geometric features. We perform extensive experiments on OpenLane-V2 datasets to validate the proposed method. The results show that our model outperforms state-of-the-art methods by a large margin, with gains of +6.7 and +9.1 on the mAP and topology metrics. Our analysis also reveals that models trained with SDMap noise augmentation exhibit enhanced robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14751v1-abstract-full').style.display = 'none'; document.getElementById('2411.14751v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures, and 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14723">arXiv:2411.14723</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14723">pdf</a>, <a href="https://arxiv.org/format/2411.14723">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Effective SAM Combination for Open-Vocabulary Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+M">Minhyeok Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+S">Suhwan Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jungho Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sunghun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+H">Heeseung Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+I">Ig-Jae Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Sangyoun Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14723v1-abstract-short" style="display: inline;"> Open-vocabulary semantic segmentation aims to assign pixel-level labels to images across an unlimited range of classes. Traditional methods address this by sequentially connecting a powerful mask proposal generator, such as the Segment Anything Model (SAM), with a pre-trained vision-language model like CLIP. 
But these two-stage approaches often suffer from high computational costs, memory ineffici&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14723v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14723v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14723v1-abstract-full" style="display: none;"> Open-vocabulary semantic segmentation aims to assign pixel-level labels to images across an unlimited range of classes. Traditional methods address this by sequentially connecting a powerful mask proposal generator, such as the Segment Anything Model (SAM), with a pre-trained vision-language model like CLIP. But these two-stage approaches often suffer from high computational costs, memory inefficiencies. In this paper, we propose ESC-Net, a novel one-stage open-vocabulary segmentation model that leverages the SAM decoder blocks for class-agnostic segmentation within an efficient inference framework. By embedding pseudo prompts generated from image-text correlations into SAM&#39;s promptable segmentation framework, ESC-Net achieves refined spatial aggregation for accurate mask predictions. ESC-Net achieves superior performance on standard benchmarks, including ADE20K, PASCAL-VOC, and PASCAL-Context, outperforming prior methods in both efficiency and accuracy. Comprehensive ablation studies further demonstrate its robustness across challenging conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14723v1-abstract-full').style.display = 'none'; document.getElementById('2411.14723v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
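As a rough illustration of the pseudo-prompt idea in the ESC-Net abstract above, the sketch below uses assumed tensor shapes and stand-in features (no real SAM or CLIP calls): it simply picks the image patches most correlated with a text embedding as point prompts.
<pre><code>
# Illustrative sketch only (not ESC-Net's code): turning image-text
# correlations into point prompts for a promptable segmenter. Feature
# extractors and the segmenter itself are stand-ins / assumptions.
import torch

def pseudo_point_prompts(patch_feats, text_feat, grid_hw, top_k=5):
    """patch_feats: [N, D] patch embeddings, text_feat: [D] class embedding,
    grid_hw: (H, W) patch grid. Returns top_k (row, col) prompt locations."""
    patch_feats = torch.nn.functional.normalize(patch_feats, dim=-1)
    text_feat = torch.nn.functional.normalize(text_feat, dim=-1)
    corr = patch_feats @ text_feat           # cosine correlation per patch, [N]
    idx = corr.topk(top_k).indices           # highest-correlation patches
    h, w = grid_hw
    return torch.stack((idx // w, idx % w), dim=-1)  # patch-grid coordinates

# Usage with dummy features (D=512, a 16x16 patch grid):
prompts = pseudo_point_prompts(torch.randn(256, 512), torch.randn(512), (16, 16))
</code></pre>
ESC-Net embeds such prompts into SAM's promptable framework; the segmenter is left out of this sketch entirely.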
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14720">arXiv:2411.14720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14720">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Social Media Annotation of HPV Vaccine Skepticism and Misinformation Using Large Language Models: An Experimental Evaluation of In-Context Learning and Fine-Tuning Stance Detection Across Multiple Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Luhang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Pendyala%2C+V">Varsha Pendyala</a>, <a href="/search/cs?searchtype=author&amp;query=Chuang%2C+Y">Yun-Shiuan Chuang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shanglin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Feldman%2C+J">Jonathan Feldman</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+A">Andrew Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=De+Choudhury%2C+M">Munmun De Choudhury</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sijia Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+D">Dhavan Shah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14720v1-abstract-short" style="display: inline;"> This paper leverages large-language models (LLMs) to experimentally determine optimal strategies for scaling up social media content annotation for stance detection on HPV vaccine-related tweets. We examine both conventional fine-tuning and emergent in-context learning methods, systematically varying strategies of prompt engineering across widely used LLMs and their variants (e.g., GPT4, Mistral,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14720v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14720v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14720v1-abstract-full" style="display: none;"> This paper leverages large-language models (LLMs) to experimentally determine optimal strategies for scaling up social media content annotation for stance detection on HPV vaccine-related tweets. We examine both conventional fine-tuning and emergent in-context learning methods, systematically varying strategies of prompt engineering across widely used LLMs and their variants (e.g., GPT4, Mistral, and Llama3, etc.). Specifically, we varied prompt template design, shot sampling methods, and shot quantity to detect stance on HPV vaccination. Our findings reveal that 1) in general, in-context learning outperforms fine-tuning in stance detection for HPV vaccine social media content; 2) increasing shot quantity does not necessarily enhance performance across models; and 3) different LLMs and their variants present differing sensitivity to in-context learning conditions. We uncovered that the optimal in-context learning configuration for stance detection on HPV vaccine tweets involves six stratified shots paired with detailed contextual prompts. 
This study highlights the potential and provides an applicable approach for applying LLMs to research on social media stance and skepticism detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14720v1-abstract-full').style.display = 'none'; document.getElementById('2411.14720v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12301">arXiv:2411.12301</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12301">pdf</a>, <a href="https://arxiv.org/format/2411.12301">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Physics-Guided Detector for SAR Airplanes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Long Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuxin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhirui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+G">Gong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12301v1-abstract-short" style="display: inline;"> The disperse structure distributions (discreteness) and variant scattering characteristics (variability) of SAR airplane targets lead to special challenges of object detection and recognition. The current deep learning-based detectors encounter challenges in distinguishing fine-grained SAR airplanes against complex backgrounds. To address it, we propose a novel physics-guided detector (PGD) learni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12301v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12301v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12301v1-abstract-full" style="display: none;"> The disperse structure distributions (discreteness) and variant scattering characteristics (variability) of SAR airplane targets lead to special challenges of object detection and recognition. The current deep learning-based detectors encounter challenges in distinguishing fine-grained SAR airplanes against complex backgrounds. To address it, we propose a novel physics-guided detector (PGD) learning paradigm for SAR airplanes that comprehensively investigate their discreteness and variability to improve the detection performance. It is a general learning paradigm that can be extended to different existing deep learning-based detectors with &#34;backbone-neck-head&#34; architectures. The main contributions of PGD include the physics-guided self-supervised learning, feature enhancement, and instance perception, denoted as PGSSL, PGFE, and PGIP, respectively. 
PGSSL aims to construct a self-supervised learning task based on a wide range of SAR airplane targets that encodes the prior knowledge of various discrete structure distributions into the embedded space. Then, PGFE enhances the multi-scale feature representation of a detector, guided by the physics-aware information learned from PGSSL. PGIP is constructed at the detection head to learn the refined and dominant scattering point of each SAR airplane instance, thus alleviating the interference from the complex background. We propose two implementations, denoted as PGD and PGD-Lite, and apply them to various existing detectors with different backbones and detection heads. The experiments demonstrate the flexibility and effectiveness of the proposed PGD, which can improve existing detectors on SAR airplane detection with the fine-grained classification task (an improvement of 3.1\% mAP at most), and achieve the state-of-the-art performance (90.7\% mAP) on the SAR-AIRcraft-1.0 dataset. The project is open-source at \url{https://github.com/XAI4SAR/PGD}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12301v1-abstract-full').style.display = 'none'; document.getElementById('2411.12301v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11706">arXiv:2411.11706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11706">pdf</a>, <a href="https://arxiv.org/format/2411.11706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MC-LLaVA: Multi-Concept Personalized Vision-Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=An%2C+R">Ruichuan An</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sihan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+K">Kai Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yulin Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiajun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+H">Hao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=She%2C+Q">Qi She</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shanghang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11706v1-abstract-short" style="display: inline;"> Current vision-language models (VLMs) show exceptional abilities across diverse tasks including visual question answering. 
To enhance user experience in practical applications, recent studies investigate VLM personalization to understand user-provided concepts. However, existing studies mainly focus on single-concept personalization, neglecting the existence and interplay of multiple concepts, whi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11706v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11706v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11706v1-abstract-full" style="display: none;"> Current vision-language models (VLMs) show exceptional abilities across diverse tasks including visual question answering. To enhance user experience in practical applications, recent studies investigate VLM personalization to understand user-provided concepts. However, existing studies mainly focus on single-concept personalization, neglecting the existence and interplay of multiple concepts, which limits the real-world applicability of personalized VLMs. In this paper, we propose the first multi-concept personalization method named MC-LLaVA along with a high-quality multi-concept personalization dataset. Specifically, MC-LLaVA uses a joint training strategy incorporating multiple concepts in a single training step, allowing VLMs to perform accurately in multi-concept personalization. To reduce the cost of joint training, MC-LLaVA leverages visual token information for concept token initialization, yielding improved concept representation and accelerating joint training. To advance multi-concept personalization research, we further contribute a high-quality dataset. We carefully collect images from various movies that contain multiple characters and manually generate the multi-concept question-answer samples. Our dataset features diverse movie types and question-answer types. We conduct comprehensive qualitative and quantitative experiments to demonstrate that MC-LLaVA can achieve impressive multi-concept personalized responses, paving the way for VLMs to become better user-specific assistants. The code and dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11706v1-abstract-full').style.display = 'none'; document.getElementById('2411.11706v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
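To make the concept-token initialization mentioned in the MC-LLaVA abstract more tangible, here is a minimal sketch with made-up shapes and an ordinary embedding table; it is not the authors' code.
<pre><code>
# Illustrative sketch only (not MC-LLaVA's code): initializing new concept
# tokens from visual features before joint multi-concept training. The
# tensor shapes and names here are assumptions for illustration.
import torch

def add_concept_tokens(embedding, concept_visual_tokens):
    """embedding: nn.Embedding holding the language model's input tokens.
    concept_visual_tokens: list of [N_i, D] tensors, one per new concept."""
    # Each new concept token starts from the mean of that concept's visual
    # tokens, giving joint training a warm start instead of a random vector.
    init_rows = torch.stack([v.mean(dim=0) for v in concept_visual_tokens])
    new_weight = torch.cat([embedding.weight.data, init_rows], dim=0)
    return torch.nn.Embedding.from_pretrained(new_weight, freeze=False)

# Usage: two new concepts, hidden size 4096 (LLaVA-style), dummy visual tokens.
emb = torch.nn.Embedding(32000, 4096)
emb = add_concept_tokens(emb, [torch.randn(12, 4096), torch.randn(9, 4096)])
</code></pre>
Starting each concept token from the mean of its visual tokens is one simple way to realize the "visual token information for concept token initialization" described above.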
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11667">arXiv:2411.11667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11667">pdf</a>, <a href="https://arxiv.org/format/2411.11667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dissecting Misalignment of Multimodal Large Language Models via Influence Function </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Lijie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+C">Chenyang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+H">Huanyi Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Saadi%2C+K">Khouloud Saadi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Di Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11667v1-abstract-short" style="display: inline;"> Multi-modal Large Language models (MLLMs) are always trained on data from diverse and unreliable sources, which may contain misaligned or mislabeled text-image pairs. This frequently causes robustness issues and hallucinations, leading to performance degradation. Data valuation is an efficient way to detect and trace these misalignments. Nevertheless, existing methods are computationally expensive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11667v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11667v1-abstract-full" style="display: none;"> Multi-modal Large Language models (MLLMs) are always trained on data from diverse and unreliable sources, which may contain misaligned or mislabeled text-image pairs. This frequently causes robustness issues and hallucinations, leading to performance degradation. Data valuation is an efficient way to detect and trace these misalignments. Nevertheless, existing methods are computationally expensive for MLLMs. While computationally efficient, the classical influence functions are inadequate for contrastive learning models because they were originally designed for pointwise loss. Additionally, contrastive learning involves minimizing the distance between the modalities of positive samples and maximizing the distance between the modalities of negative samples. This requires us to evaluate the influence of samples from both perspectives. To tackle these challenges, we introduce the Extended Influence Function for Contrastive Loss (ECIF), an influence function crafted for contrastive loss. ECIF considers both positive and negative samples and provides a closed-form approximation of contrastive learning models, eliminating the need for retraining. 
Building upon ECIF, we develop a series of algorithms for data evaluation in MLLM, misalignment detection, and misprediction trace-back tasks. Experimental results demonstrate our ECIF advances the transparency and interpretability of MLLMs by offering a more accurate assessment of data impact and model alignment compared to traditional baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11667v1-abstract-full').style.display = 'none'; document.getElementById('2411.11667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11536">arXiv:2411.11536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11536">pdf</a>, <a href="https://arxiv.org/format/2411.11536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical-Graph-Structured Edge Partition Models for Learning Evolving Community Structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xincan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sikun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11536v1-abstract-short" style="display: inline;"> We propose a novel dynamic network model to capture evolving latent communities within temporal networks. To achieve this, we decompose each observed dynamic edge between vertices using a Poisson-gamma edge partition model, assigning each vertex to one or more latent communities through \emph{nonnegative} vertex-community memberships. Specifically, hierarchical transition kernels are employed to m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11536v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11536v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11536v1-abstract-full" style="display: none;"> We propose a novel dynamic network model to capture evolving latent communities within temporal networks. To achieve this, we decompose each observed dynamic edge between vertices using a Poisson-gamma edge partition model, assigning each vertex to one or more latent communities through \emph{nonnegative} vertex-community memberships. Specifically, hierarchical transition kernels are employed to model the interactions between these latent communities in the observed temporal network. A hierarchical graph prior is placed on the transition structure of the latent communities, allowing us to model how they evolve and interact over time. 
Consequently, our dynamic network enables the inferred community structure to merge, split, and interact with one another, providing a comprehensive understanding of complex network dynamics. Experiments on various real-world network datasets demonstrate that the proposed model not only effectively uncovers interpretable latent structures but also surpasses other state-of-the art dynamic network models in the tasks of link prediction and community detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11536v1-abstract-full').style.display = 'none'; document.getElementById('2411.11536v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11479">arXiv:2411.11479</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11479">pdf</a>, <a href="https://arxiv.org/format/2411.11479">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Quantifying Preferences of Vision-Language Models via Value Decomposition in Social Media Contexts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingxuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuning Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shengqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yizhou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y+N">Ying Nian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11479v1-abstract-short" style="display: inline;"> The rapid advancement of Vision-Language Models (VLMs) has expanded multimodal applications, yet evaluations often focus on basic tasks like object recognition, overlooking abstract aspects such as personalities and values. To address this gap, we introduce Value-Spectrum, a visual question-answering benchmark aimed at assessing VLMs based on Schwartz&#39;s value dimensions, which capture core values&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11479v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11479v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11479v1-abstract-full" style="display: none;"> The rapid advancement of Vision-Language Models (VLMs) has expanded multimodal applications, yet evaluations often focus on basic tasks like object recognition, overlooking abstract aspects such as personalities and values. To address this gap, we introduce Value-Spectrum, a visual question-answering benchmark aimed at assessing VLMs based on Schwartz&#39;s value dimensions, which capture core values guiding people&#39;s beliefs and actions across cultures. 
We constructed a vectorized database of over 50,000 short videos sourced from TikTok, YouTube Shorts, and Instagram Reels, covering multiple months and a wide array of topics such as family, health, hobbies, society, and technology. We also developed a VLM agent pipeline to automate video browsing and analysis. Benchmarking representative VLMs on Value-Spectrum reveals significant differences in their responses to value-oriented content, with most models exhibiting a preference for hedonistic topics. Beyond identifying natural preferences, we explored the ability of VLM agents to adopt specific personas when explicitly prompted, revealing insights into the models&#39; adaptability in role-playing scenarios. These findings highlight the potential of Value-Spectrum as a comprehensive evaluation set for tracking VLM advancements in value-based tasks and for developing more sophisticated role-playing AI agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11479v1-abstract-full').style.display = 'none'; document.getElementById('2411.11479v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11289">arXiv:2411.11289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11289">pdf</a>, <a href="https://arxiv.org/format/2411.11289">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y">Yungi Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Ha%2C+H">Hyunsoo Ha</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Seonghoon Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Sukyung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jihoo Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+C">Chanjun Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11289v1-abstract-short" style="display: inline;"> Creating high-quality, large-scale datasets for large language models (LLMs) often relies on resource-intensive, GPU-accelerated models for quality filtering, making the process time-consuming and costly. This dependence on GPUs limits accessibility for organizations lacking significant computational infrastructure. 
To address this issue, we introduce the Lightweight, Purpose-driven (LP) Data Pipe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11289v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11289v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11289v1-abstract-full" style="display: none;"> Creating high-quality, large-scale datasets for large language models (LLMs) often relies on resource-intensive, GPU-accelerated models for quality filtering, making the process time-consuming and costly. This dependence on GPUs limits accessibility for organizations lacking significant computational infrastructure. To address this issue, we introduce the Lightweight, Purpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs to streamline the processes of dataset extraction, filtering, and curation. Based on our four core principles, the LP Data Pipeline significantly reduces preparation time and cost while maintaining high data quality. Importantly, our pipeline enables the creation of purpose-driven datasets tailored to specific domains and languages, enhancing the applicability of LLMs in specialized contexts. We anticipate that our pipeline will lower the barriers to LLM development, enabling a wide range of organizations to access LLMs more easily. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11289v1-abstract-full').style.display = 'none'; document.getElementById('2411.11289v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11276">arXiv:2411.11276</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11276">pdf</a>, <a href="https://arxiv.org/format/2411.11276">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Coupled Integral PINN for conservation law </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yeping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shihao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11276v1-abstract-short" style="display: inline;"> The Physics-Informed Neural Network (PINN) is an innovative approach to solve a diverse array of partial differential equations (PDEs) leveraging the power of neural networks. This is achieved by minimizing the residual loss associated with the explicit physical information, usually coupled with data derived from initial and boundary conditions. 
However, a challenge arises in the context of nonlin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11276v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11276v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11276v1-abstract-full" style="display: none;"> The Physics-Informed Neural Network (PINN) is an innovative approach to solve a diverse array of partial differential equations (PDEs) leveraging the power of neural networks. This is achieved by minimizing the residual loss associated with the explicit physical information, usually coupled with data derived from initial and boundary conditions. However, a challenge arises in the context of nonlinear conservation laws where derivatives are undefined at shocks, leading to solutions that deviate from the true physical phenomena. To solve this issue, the physical solution must be extracted from the weak formulation of the PDE and is typically further bounded by entropy conditions. Within the numerical framework, finite volume methods (FVM) are employed to address conservation laws. These methods resolve the integral form of conservation laws and delineate the shock characteristics. Inspired by the principles underlying FVM, this paper introduces a novel Coupled Integrated PINN methodology that involves fitting the integral solutions of equations using additional neural networks. This technique not only augments the conventional PINN&#39;s capability in modeling shock waves, but also eliminates the need for spatial and temporal discretization. As such, it bypasses the complexities of numerical integration and reconstruction associated with non-convex fluxes. Finally, we show that the proposed new Integrated PINN performs well on conservation laws and outperforms the vanilla PINN when tackling the challenging shock problems, using examples of Burgers&#39; equation, the Buckley-Leverett equation, and the Euler system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11276v1-abstract-full').style.display = 'none'; document.getElementById('2411.11276v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
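For context on the baseline this abstract builds on, the snippet below is a minimal sketch of a vanilla PINN residual loss for Burgers' equation, with an assumed network size and random collocation points; the paper's coupled integral formulation is not reproduced here.
<pre><code>
# Illustrative sketch only: the standard PINN residual loss for Burgers'
# equation u_t + u*u_x = nu*u_xx, i.e. the baseline the abstract refers to;
# the paper's coupled integral networks are not reproduced here.
import torch

net = torch.nn.Sequential(
    torch.nn.Linear(2, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 1),
)

def residual_loss(xt, nu=0.01):
    """xt: [N, 2] collocation points (x, t) with requires_grad=True."""
    u = net(xt)
    grads = torch.autograd.grad(u, xt, torch.ones_like(u), create_graph=True)[0]
    u_x, u_t = grads[:, 0:1], grads[:, 1:2]
    u_xx = torch.autograd.grad(u_x, xt, torch.ones_like(u_x), create_graph=True)[0][:, 0:1]
    return ((u_t + u * u_x - nu * u_xx) ** 2).mean()

xt = torch.rand(1024, 2, requires_grad=True)   # random collocation points
loss = residual_loss(xt)                       # add initial/boundary terms in practice
loss.backward()
</code></pre>
At shocks the derivatives in this pointwise residual are ill-defined, which is exactly the failure mode the abstract addresses by fitting integral (weak-form) solutions instead.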
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11098">arXiv:2411.11098</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11098">pdf</a>, <a href="https://arxiv.org/format/2411.11098">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MolParser: End-to-end Visual Recognition of Molecule Structures in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fang%2C+X">Xi Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiankun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xiaochen Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shangqian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuwen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+L">Lin Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Linfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ke%2C+G">Guolin Ke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11098v1-abstract-short" style="display: inline;"> In recent decades, chemistry publications and patents have increased rapidly. A significant portion of key information is embedded in molecular structure figures, complicating large-scale literature searches and limiting the application of large language models in fields such as biology, chemistry, and pharmaceuticals. The automatic extraction of precise chemical structures is of critical importan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11098v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11098v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11098v1-abstract-full" style="display: none;"> In recent decades, chemistry publications and patents have increased rapidly. A significant portion of key information is embedded in molecular structure figures, complicating large-scale literature searches and limiting the application of large language models in fields such as biology, chemistry, and pharmaceuticals. The automatic extraction of precise chemical structures is of critical importance. However, the presence of numerous Markush structures in real-world documents, along with variations in molecular image quality, drawing styles, and noise, significantly limits the performance of existing optical chemical structure recognition (OCSR) methods. We present MolParser, a novel end-to-end OCSR method that efficiently and accurately recognizes chemical structures from real-world documents, including difficult Markush structures. We use an extended SMILES encoding rule to annotate our training dataset. Under this rule, we build MolParser-7M, the largest annotated molecular image dataset to our knowledge. While utilizing a large amount of synthetic data, we employed active learning methods to incorporate substantial in-the-wild data, specifically samples cropped from real patents and scientific literature, into the training process. 
We trained an end-to-end molecular image captioning model, MolParser, using a curriculum learning approach. MolParser significantly outperforms classical and learning-based methods across most scenarios, with potential for broader downstream applications. The dataset is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11098v1-abstract-full').style.display = 'none'; document.getElementById('2411.11098v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08499">arXiv:2411.08499</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08499">pdf</a>, <a href="https://arxiv.org/format/2411.08499">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning Robust Grasping Strategy Through Tactile Sensing and Adaption Skill </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yueming Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mengde Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Songhua Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuetao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Sheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Miao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08499v2-abstract-short" style="display: inline;"> Robust grasping represents an essential task in robotics, necessitating tactile feedback and reactive grasping adjustments for robust grasping of objects. Previous research has extensively combined tactile sensing with grasping, primarily relying on rule-based approaches, frequently neglecting post-grasping difficulties such as external disruptions or inherent uncertainties of the object&#39;s physics&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08499v2-abstract-full').style.display = 'inline'; document.getElementById('2411.08499v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08499v2-abstract-full" style="display: none;"> Robust grasping represents an essential task in robotics, necessitating tactile feedback and reactive grasping adjustments for robust grasping of objects. Previous research has extensively combined tactile sensing with grasping, primarily relying on rule-based approaches, frequently neglecting post-grasping difficulties such as external disruptions or inherent uncertainties of the object&#39;s physics and geometry. To address these limitations, this paper introduces a human-demonstration-based adaptive grasping policy based on tactile sensing, which aims to achieve robust gripping while resisting disturbances to maintain grasp stability. Our trained model generalizes to daily objects with seven different sizes, shapes, and textures. 
Experimental results demonstrate that our method performs well in dynamic and force interaction tasks and exhibits excellent generalization ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08499v2-abstract-full').style.display = 'none'; document.getElementById('2411.08499v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08033">arXiv:2411.08033</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08033">pdf</a>, <a href="https://arxiv.org/format/2411.08033">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+Y">Yushi Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shangchen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Z">Zhaoyang Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+F">Fangzhou Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+B">Bo Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+X">Xingang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Loy%2C+C+C">Chen Change Loy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08033v1-abstract-short" style="display: inline;"> While 3D content generation has advanced significantly, existing methods still face challenges with input formats, latent space design, and output representations. This paper introduces a novel 3D generation framework that addresses these challenges, offering scalable, high-quality 3D generation with an interactive Point Cloud-structured Latent space. Our framework employs a Variational Autoencode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08033v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08033v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08033v1-abstract-full" style="display: none;"> While 3D content generation has advanced significantly, existing methods still face challenges with input formats, latent space design, and output representations. This paper introduces a novel 3D generation framework that addresses these challenges, offering scalable, high-quality 3D generation with an interactive Point Cloud-structured Latent space. 
Our framework employs a Variational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal) renderings as input, using a unique latent space design that preserves 3D shape information, and incorporates a cascaded latent diffusion model for improved shape-texture disentanglement. The proposed method, GaussianAnything, supports multi-modal conditional 3D generation, allowing for point cloud, caption, and single/multi-view image inputs. Notably, the newly proposed latent space naturally enables geometry-texture disentanglement, thus allowing 3D-aware editing. Experimental results demonstrate the effectiveness of our approach on multiple datasets, outperforming existing methods in both text- and image-conditioned 3D generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08033v1-abstract-full').style.display = 'none'; document.getElementById('2411.08033v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://nirvanalan.github.io/projects/GA/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07982">arXiv:2411.07982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07982">pdf</a>, <a href="https://arxiv.org/format/2411.07982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Interoperability From Kieker to OpenTelemetry: Demonstrated as Export to ExplorViz </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Reichelt%2C+D+G">David Georg Reichelt</a>, <a href="/search/cs?searchtype=author&amp;query=Hansen%2C+M">Malte Hansen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shinhyung Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hasselbring%2C+W">Wilhelm Hasselbring</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07982v1-abstract-short" style="display: inline;"> While the observability framework Kieker has a low overhead for tracing, its results currently cannot be used in most analysis tools due to lack of interoperability of the data formats. The OpenTelemetry standard aims for standardizing observability data. In this work, we describe how to export Kieker distributed tracing data to OpenTelemetry. 
This is done using the pipe-and-filter framework Tee&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07982v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07982v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07982v1-abstract-full" style="display: none;"> While the observability framework Kieker has a low overhead for tracing, its results currently cannot be used in most analysis tools due to lack of interoperability of the data formats. The OpenTelemetry standard aims for standardizing observability data. In this work, we describe how to export Kieker distributed tracing data to OpenTelemetry. This is done using the pipe-and-filter framework TeeTime. For TeeTime, a stage was defined that uses Kieker execution data, which can be created from most record types. We demonstrate the usability of our approach by visualizing trace data of TeaStore in the ExplorViz visualization tool. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07982v1-abstract-full').style.display = 'none'; document.getElementById('2411.07982v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.7; D.2.12 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07699">arXiv:2411.07699</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07699">pdf</a>, <a href="https://arxiv.org/format/2411.07699">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RINO: Accurate, Robust Radar-Inertial Odometry with Non-Iterative Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuocheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yueming Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S+E">Shengbo Eben Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shaobing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07699v2-abstract-short" style="display: inline;"> Precise localization and mapping are critical for achieving autonomous navigation in self-driving vehicles. However, ego-motion estimation still faces significant challenges, particularly when GNSS failures occur or under extreme weather conditions (e.g., fog, rain, and snow). In recent years, scanning radar has emerged as an effective solution due to its strong penetration capabilities. 
Neverthel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07699v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07699v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07699v2-abstract-full" style="display: none;"> Precise localization and mapping are critical for achieving autonomous navigation in self-driving vehicles. However, ego-motion estimation still faces significant challenges, particularly when GNSS failures occur or under extreme weather conditions (e.g., fog, rain, and snow). In recent years, scanning radar has emerged as an effective solution due to its strong penetration capabilities. Nevertheless, scanning radar data inherently contains high levels of noise, necessitating hundreds to thousands of iterations of optimization to estimate a reliable transformation from the noisy data. Such iterative solving is time-consuming, unstable, and prone to failure. To address these challenges, we propose an accurate and robust Radar-Inertial Odometry system, RINO, which employs a non-iterative solving approach. Our method decouples rotation and translation estimation and applies an adaptive voting scheme for 2D rotation estimation, enhancing efficiency while ensuring consistent solving time. Additionally, the approach implements a loosely coupled system between the scanning radar and an inertial measurement unit (IMU), leveraging Error-State Kalman Filtering (ESKF). Notably, we successfully estimated the uncertainty of the pose estimation from the scanning radar, incorporating this into the filter&#39;s Maximum A Posteriori estimation, a consideration that has been previously overlooked. Validation on publicly available datasets demonstrates that RINO outperforms state-of-the-art methods and baselines in both accuracy and robustness. Our code is available at https://github.com/yangsc4063/rino. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07699v2-abstract-full').style.display = 'none'; document.getElementById('2411.07699v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07111">arXiv:2411.07111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07111">pdf</a>, <a href="https://arxiv.org/format/2411.07111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Building a Taiwanese Mandarin Spoken Language Model: A First Attempt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yu-Kuan Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yi-Cheng Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H+L">Ho Lam Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wei-Ping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+T">Tzu-Quan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hsiu-Hsuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+E">En-Pei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+C">Chan-Jan Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+L">Liang-Hsuan Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Chiu%2C+I">I-Hsiang Chiu</a>, <a href="/search/cs?searchtype=author&amp;query=Sanga%2C+U">Ulin Sanga</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07111v1-abstract-short" style="display: inline;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. 
Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07111v1-abstract-full" style="display: none;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex capabilities allowing simultaneous speaking and listening. The paper also details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction. We also developed a platform to evaluate conversational fluency and response coherence in multi-turn dialogues. We hope the release of the report can contribute to the future development of spoken LLMs in Taiwanese Mandarin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'none'; document.getElementById('2411.07111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06396">arXiv:2411.06396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06396">pdf</a>, <a href="https://arxiv.org/format/2411.06396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Variance Minimization Approach to Temporal-Difference Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingguo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Y">Yu Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shangdong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenhao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06396v1-abstract-short" style="display: inline;"> Fast-converging algorithms are a contemporary requirement in reinforcement learning. In the context of linear function approximation, the magnitude of the smallest eigenvalue of the key matrix is a major factor reflecting the convergence speed. 
Traditional value-based RL algorithms focus on minimizing errors. This paper introduces a variance minimization (VM) approach for value-based RL instead of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06396v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06396v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06396v1-abstract-full" style="display: none;"> Fast-converging algorithms are a contemporary requirement in reinforcement learning. In the context of linear function approximation, the magnitude of the smallest eigenvalue of the key matrix is a major factor reflecting the convergence speed. Traditional value-based RL algorithms focus on minimizing errors. This paper introduces a variance minimization (VM) approach for value-based RL instead of error minimization. Based on this approach, we proposed two objectives, the Variance of Bellman Error (VBE) and the Variance of Projected Bellman Error (VPBE), and derived the VMTD, VMTDC, and VMETD algorithms. We provided proofs of their convergence and optimal policy invariance of the variance minimization. Experimental studies validate the effectiveness of the proposed algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06396v1-abstract-full').style.display = 'none'; document.getElementById('2411.06396v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06191">arXiv:2411.06191</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06191">pdf</a>, <a href="https://arxiv.org/format/2411.06191">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalizing Hyperedge Expansion for Hyper-relational Knowledge Graph Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+J">Jingtao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Q">Quanming Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06191v1-abstract-short" style="display: inline;"> By representing knowledge in a primary triple associated with additional attribute-value qualifiers, hyper-relational knowledge graph (HKG) that generalizes triple-based knowledge graph (KG) has been attracting research attention recently. Compared with KG, HKG is enriched with the semantic qualifiers as well as the hyper-relational graph structure. 
However, to model HKG, existing studies mainly f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06191v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06191v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06191v1-abstract-full" style="display: none;"> By representing knowledge in a primary triple associated with additional attribute-value qualifiers, hyper-relational knowledge graph (HKG) that generalizes triple-based knowledge graph (KG) has been attracting research attention recently. Compared with KG, HKG is enriched with the semantic qualifiers as well as the hyper-relational graph structure. However, to model HKG, existing studies mainly focus on either semantic information or structural information therein, which however fail to capture both simultaneously. To tackle this issue, in this paper, we generalize the hyperedge expansion in hypergraph learning and propose an equivalent transformation for HKG modeling, referred to as TransEQ. Specifically, the equivalent transformation transforms a HKG to a KG, which considers both semantic and structural characteristics. Then an encoder-decoder framework is developed to bridge the modeling research between KG and HKG. In the encoder part, KG-based graph neural networks are leveraged for structural modeling; while in the decoder part, various HKG-based scoring functions are exploited for semantic modeling. Especially, we design the sharing embedding mechanism in the encoder-decoder framework with semantic relatedness captured. We further theoretically prove that TransEQ preserves complete information in the equivalent transformation, and also achieves full expressivity. Finally, extensive experiments on three benchmarks demonstrate the superior performance of TransEQ in terms of both effectiveness and efficiency. On the largest benchmark WikiPeople, TransEQ significantly improves the state-of-the-art models by 15\% on MRR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06191v1-abstract-full').style.display = 'none'; document.getElementById('2411.06191v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05852">arXiv:2411.05852</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05852">pdf</a>, <a href="https://arxiv.org/format/2411.05852">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> $\spadesuit$ SPADE $\spadesuit$ Split Peak Attention DEcomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wolff%2C+M">Malcolm Wolff</a>, <a href="/search/cs?searchtype=author&amp;query=Olivares%2C+K+G">Kin G. 
Olivares</a>, <a href="/search/cs?searchtype=author&amp;query=Oreshkin%2C+B">Boris Oreshkin</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+S">Sunny Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sitan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Katoch%2C+A">Abhinav Katoch</a>, <a href="/search/cs?searchtype=author&amp;query=Ramasubramanian%2C+S">Shankar Ramasubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Youxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Efimov%2C+D">Dmitry Efimov</a>, <a href="/search/cs?searchtype=author&amp;query=Quenneville-B%C3%A9lair%2C+V">Vincent Quenneville-Bélair</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05852v1-abstract-short" style="display: inline;"> Demand forecasting faces challenges induced by Peak Events (PEs) corresponding to special periods such as promotions and holidays. Peak events create significant spikes in demand followed by demand ramp down periods. Neural networks like MQCNN and MQT overreact to demand peaks by carrying over the elevated PE demand into subsequent Post-Peak-Event (PPE) periods, resulting in significantly over-bia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05852v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05852v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05852v1-abstract-full" style="display: none;"> Demand forecasting faces challenges induced by Peak Events (PEs) corresponding to special periods such as promotions and holidays. Peak events create significant spikes in demand followed by demand ramp down periods. Neural networks like MQCNN and MQT overreact to demand peaks by carrying over the elevated PE demand into subsequent Post-Peak-Event (PPE) periods, resulting in significantly over-biased forecasts. To tackle this challenge, we introduce a neural forecasting model called Split Peak Attention DEcomposition, SPADE. This model reduces the impact of PEs on subsequent forecasts by modeling forecasting as consisting of two separate tasks: one for PEs; and the other for the rest. Its architecture then uses masked convolution filters and a specialized Peak Attention module. We show SPADE&#39;s performance on a worldwide retail dataset with hundreds of millions of products. Our results reveal a reduction in PPE degradation by 4.5% and an improvement in PE accuracy by 3.9%, relative to current production models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05852v1-abstract-full').style.display = 'none'; document.getElementById('2411.05852v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In 31st Conference on Neural Information Processing In 38th Conference on Neural Information Processing Systems NIPS 2017, Time Series in the Age of Large Models Workshop, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/cs?searchtype=author&amp;query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/cs?searchtype=author&amp;query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. 
Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04568">arXiv:2411.04568</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04568">pdf</a>, <a href="https://arxiv.org/format/2411.04568">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-Attention-based EEG State Transition Modeling for Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xinke Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+R">Runmin Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kaixuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingzhu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Quanying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Sen Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04568v1-abstract-short" style="display: inline;"> Electroencephalogram (EEG)-based emotion decoding can objectively quantify people&#39;s emotional state and has broad application prospects in human-computer interaction and early detection of emotional disorders. Recently emerging deep learning architectures have significantly improved the performance of EEG emotion decoding. However, existing methods still fall short of fully capturing the complex s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04568v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04568v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04568v1-abstract-full" style="display: none;"> Electroencephalogram (EEG)-based emotion decoding can objectively quantify people&#39;s emotional state and has broad application prospects in human-computer interaction and early detection of emotional disorders. Recently emerging deep learning architectures have significantly improved the performance of EEG emotion decoding. However, existing methods still fall short of fully capturing the complex spatiotemporal dynamics of neural signals, which are crucial for representing emotion processing. This study proposes a Dynamic-Attention-based EEG State Transition (DAEST) modeling method to characterize EEG spatiotemporal dynamics. The model extracts spatiotemporal components of EEG that represent multiple parallel neural processes and estimates dynamic attention weights on these components to capture transitions in brain states. The model is optimized within a contrastive learning framework for cross-subject emotion recognition. The proposed method achieved state-of-the-art performance on three publicly available datasets: FACED, SEED, and SEED-V. 
It achieved 75.4% accuracy in the binary classification of positive and negative emotions and 59.3% in nine-class discrete emotion classification on the FACED dataset, 88.1% in the three-class classification of positive, negative, and neutral emotions on the SEED dataset, and 73.6% in five-class discrete emotion classification on the SEED-V dataset. The learned EEG spatiotemporal patterns and dynamic transition properties offer valuable insights into neural dynamics underlying emotion processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04568v1-abstract-full').style.display = 'none'; document.getElementById('2411.04568v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04554">arXiv:2411.04554</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04554">pdf</a>, <a href="https://arxiv.org/format/2411.04554">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Peri-midFormer: Periodic Pyramid Transformer for Time Series Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+G">Gechang Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Z">Zhixi Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04554v1-abstract-short" style="display: inline;"> Time series analysis finds wide applications in fields such as weather forecasting, anomaly detection, and behavior recognition. Previous methods attempted to model temporal variations directly using 1D time series. However, this has been quite challenging due to the discrete nature of data points in time series and the complexity of periodic variation. In terms of periodicity, taking weather and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04554v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04554v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04554v1-abstract-full" style="display: none;"> Time series analysis finds wide applications in fields such as weather forecasting, anomaly detection, and behavior recognition. Previous methods attempted to model temporal variations directly using 1D time series. However, this has been quite challenging due to the discrete nature of data points in time series and the complexity of periodic variation. 
In terms of periodicity, taking weather and traffic data as an example, there are multi-periodic variations such as yearly, monthly, weekly, and daily, etc. In order to break through the limitations of the previous methods, we decouple the implied complex periodic variations into inclusion and overlap relationships among different level periodic components based on the observation of the multi-periodicity therein and its inclusion relationships. This explicitly represents the naturally occurring pyramid-like properties in time series, where the top level is the original time series and lower levels consist of periodic components with gradually shorter periods, which we call the periodic pyramid. To further extract complex temporal variations, we introduce self-attention mechanism into the periodic pyramid, capturing complex periodic relationships by computing attention between periodic components based on their inclusion, overlap, and adjacency relationships. Our proposed Peri-midFormer demonstrates outstanding performance in five mainstream time series analysis tasks, including short- and long-term forecasting, imputation, classification, and anomaly detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04554v1-abstract-full').style.display = 'none'; document.getElementById('2411.04554v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04308">arXiv:2411.04308</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04308">pdf</a>, <a href="https://arxiv.org/format/2411.04308">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Bilingual Capabilities of Language Models to Support Diverse Linguistic Practices in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Syamkumar%2C+A">Anand Syamkumar</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+N">Nora Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Barron%2C+K">Kaycie Barron</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shanglin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Karumbaiah%2C+S">Shamya Karumbaiah</a>, <a href="/search/cs?searchtype=author&amp;query=Uppal%2C+R">Rheeya Uppal</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Junjie Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04308v1-abstract-short" style="display: inline;"> Large language models (LLMs) offer promise in generating educational content, providing instructor feedback, and 
reducing teacher workload on assessments. While prior studies have focused on studying LLM-powered learning analytics, limited research has examined how effective LLMs are in a bilingual context. In this paper, we study the effectiveness of multilingual large language models (MLLMs) acr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04308v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04308v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04308v1-abstract-full" style="display: none;"> Large language models (LLMs) offer promise in generating educational content, providing instructor feedback, and reducing teacher workload on assessments. While prior studies have focused on studying LLM-powered learning analytics, limited research has examined how effective LLMs are in a bilingual context. In this paper, we study the effectiveness of multilingual large language models (MLLMs) across monolingual (English-only, Spanish-only) and bilingual (Spanglish) student writing. We present a learning analytics use case that details LLM performance in assessing acceptable and unacceptable explanations of Science and Social Science concepts. Our findings reveal a significant bias in the grading performance of pre-trained models for bilingual writing compared to English-only and Spanish-only writing. Following this, we fine-tune open-source MLLMs including Llama 3.1 and Mistral NeMo using synthetic datasets generated in English, Spanish, and Spanglish. Our experiments indicate that the models perform significantly better for all three languages after fine-tuning with bilingual data. This study highlights the potential of enhancing MLLM effectiveness to support authentic language practices amongst bilingual learners. It also aims to illustrate the value of incorporating non-English languages into the design and implementation of language models in education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04308v1-abstract-full').style.display = 'none'; document.getElementById('2411.04308v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02815">arXiv:2411.02815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02815">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Artificial Intelligence-Enhanced Couinaud Segmentation for Precision Liver Cancer Therapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Liang Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+W">Wenhao Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+X">Xiaohan Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Rajendran%2C+P">Praveenbalaji Rajendran</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yuming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Pastor-Serrano%2C+O">Oscar Pastor-Serrano</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qiang Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02815v1-abstract-short" style="display: inline;"> Precision therapy for liver cancer necessitates accurately delineating liver sub-regions to protect healthy tissue while targeting tumors, which is essential for reducing recurrence and improving survival rates. However, the segmentation of hepatic segments, known as Couinaud segmentation, is challenging due to indistinct sub-region boundaries and the need for extensive annotated datasets. This st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02815v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02815v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02815v1-abstract-full" style="display: none;"> Precision therapy for liver cancer necessitates accurately delineating liver sub-regions to protect healthy tissue while targeting tumors, which is essential for reducing recurrence and improving survival rates. However, the segmentation of hepatic segments, known as Couinaud segmentation, is challenging due to indistinct sub-region boundaries and the need for extensive annotated datasets. This study introduces LiverFormer, a novel Couinaud segmentation model that effectively integrates global context with low-level local features based on a 3D hybrid CNN-Transformer architecture. Additionally, a registration-based data augmentation strategy is equipped to enhance the segmentation performance with limited labeled data. Evaluated on CT images from 123 patients, LiverFormer demonstrated high accuracy and strong concordance with expert annotations across various metrics, allowing for enhanced treatment planning for surgery and radiation therapy. 
It has great potential to reduce complications and minimize potential damage to surrounding tissue, leading to improved outcomes for patients undergoing complex liver cancer treatments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02815v1-abstract-full').style.display = 'none'; document.getElementById('2411.02815v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02722">arXiv:2411.02722</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02722">pdf</a>, <a href="https://arxiv.org/format/2411.02722">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Commonsense Knowledge Distillation for Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Siwen Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+S+C">Soyeon Caren Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02722v1-abstract-short" style="display: inline;"> Existing Multimodal Large Language Models (MLLMs) and Visual Language Pretrained Models (VLPMs) have shown remarkable performances in the general Visual Question Answering (VQA). However, these models struggle with VQA questions that require external commonsense knowledge due to the challenges in generating high-quality prompts and the high computational costs of fine-tuning. In this work, we prop&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02722v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02722v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02722v1-abstract-full" style="display: none;"> Existing Multimodal Large Language Models (MLLMs) and Visual Language Pretrained Models (VLPMs) have shown remarkable performances in the general Visual Question Answering (VQA). However, these models struggle with VQA questions that require external commonsense knowledge due to the challenges in generating high-quality prompts and the high computational costs of fine-tuning. In this work, we propose a novel graph-based multimodal commonsense knowledge distillation framework that constructs a unified relational graph over commonsense knowledge, visual objects and questions through a Graph Convolutional Network (GCN) following a teacher-student environment. This proposed framework is flexible with any type of teacher and student models without further fine-tuning, and has achieved competitive performances on the ScienceQA dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02722v1-abstract-full').style.display = 'none'; document.getElementById('2411.02722v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025 (Accepted, Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kan Wu</a> , et al. 
(83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large&#39;s superior performance across various benchmarks including language understanding and generation, logica&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large&#39;s superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02059">arXiv:2411.02059</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02059">pdf</a>, <a href="https://arxiv.org/format/2411.02059">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> TableGPT2: A Large Multimodal Model with Tabular Data Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Su%2C+A">Aofeng Su</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+A">Aowen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+C">Chao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Ga Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Gang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+G">Guangcheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haobo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haokai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+H">Haoxuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+J">Jiaming Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jing Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Junbo Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Junlin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shou%2C+K">Kaizhe Shou</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+L">Liangyu Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+L">Lin Long</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liyao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+P">Pengzuo Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Q">Qingyi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Saisai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02059v3-abstract-short" style="display: inline;"> The emergence of models like GPTs, Claude, LLaMA, and Qwen has reshaped AI applications, presenting vast new opportunities across industries. Yet, the integration of tabular data remains notably underdeveloped, despite its foundational role in numerous real-world domains. This gap is critical for three main reasons. 
First, database or data warehouse data integration is essential for advanced app&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02059v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02059v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02059v3-abstract-full" style="display: none;"> The emergence of models like GPTs, Claude, LLaMA, and Qwen has reshaped AI applications, presenting vast new opportunities across industries. Yet, the integration of tabular data remains notably underdeveloped, despite its foundational role in numerous real-world domains. This gap is critical for three main reasons. First, database or data warehouse data integration is essential for advanced applications; second, the vast and largely untapped resource of tabular data offers immense potential for analysis; and third, the business intelligence domain specifically demands adaptable, precise solutions that many current LLMs may struggle to provide. In response, we introduce TableGPT2, a model rigorously pre-trained and fine-tuned with over 593.8K tables and 2.36M high-quality query-table-output tuples, a scale of table-related data unprecedented in prior research. This extensive training enables TableGPT2 to excel in table-centric tasks while maintaining strong general language and coding abilities. One of TableGPT2&#39;s key innovations is its novel table encoder, specifically designed to capture schema-level and cell-level information. This encoder strengthens the model&#39;s ability to handle ambiguous queries, missing column names, and irregular tables commonly encountered in real-world applications. Similar to visual language models, this pioneering approach integrates with the decoder to form a robust large multimodal model. We believe the results are compelling: over 23 benchmarking metrics, TableGPT2 achieves an average performance improvement of 35.20% in the 7B model and 49.32% in the 72B model over prior benchmark-neutral LLMs, with robust general-purpose capabilities intact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02059v3-abstract-full').style.display = 'none'; document.getElementById('2411.02059v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01981">arXiv:2411.01981</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01981">pdf</a>, <a href="https://arxiv.org/format/2411.01981">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Typicalness-Aware Learning for Failure Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yijun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Jiequan Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Z">Zhuotao Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Senqiao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qingdong He</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoling Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jingyong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01981v2-abstract-short" style="display: inline;"> Deep neural networks (DNNs) often suffer from the overconfidence issue, where incorrect predictions are made with high confidence scores, hindering the applications in critical systems. In this paper, we propose a novel approach called Typicalness-Aware Learning (TAL) to address this issue and improve failure detection performance. We observe that, with the cross-entropy loss, model predictions ar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01981v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01981v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01981v2-abstract-full" style="display: none;"> Deep neural networks (DNNs) often suffer from the overconfidence issue, where incorrect predictions are made with high confidence scores, hindering the applications in critical systems. In this paper, we propose a novel approach called Typicalness-Aware Learning (TAL) to address this issue and improve failure detection performance. We observe that, with the cross-entropy loss, model predictions are optimized to align with the corresponding labels via increasing logit magnitude or refining logit direction. However, regarding atypical samples, the image content and their labels may exhibit disparities. This discrepancy can lead to overfitting on atypical samples, ultimately resulting in the overconfidence issue that we aim to address. To tackle the problem, we have devised a metric that quantifies the typicalness of each sample, enabling the dynamic adjustment of the logit magnitude during the training process. By allowing atypical samples to be adequately fitted while preserving reliable logit direction, the problem of overconfidence can be mitigated. TAL has been extensively evaluated on benchmark datasets, and the results demonstrate its superiority over existing failure detection methods. Specifically, TAL achieves a more than 5% improvement on CIFAR100 in terms of the Area Under the Risk-Coverage Curve (AURC) compared to the state-of-the-art. 
Code is available at https://github.com/liuyijungoon/TAL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01981v2-abstract-full').style.display = 'none'; document.getElementById('2411.01981v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01102">arXiv:2411.01102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01102">pdf</a>, <a href="https://arxiv.org/format/2411.01102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> BinEnhance: An Enhancement Framework Based on External Environment Semantics for Binary Code Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yongpan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiaojie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+C">Chaopeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shouguo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+K">Kangyuan Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01102v3-abstract-short" style="display: inline;"> Binary code search plays a crucial role in applications like software reuse detection. Currently, existing models are typically based on either internal code semantics or a combination of function call graphs (CG) and internal code semantics. However, these models have limitations. Internal code semantic models only consider the semantics within the function, ignoring the inter-function semantics,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01102v3-abstract-full').style.display = 'inline'; document.getElementById('2411.01102v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01102v3-abstract-full" style="display: none;"> Binary code search plays a crucial role in applications like software reuse detection. Currently, existing models are typically based on either internal code semantics or a combination of function call graphs (CG) and internal code semantics. However, these models have limitations. Internal code semantic models only consider the semantics within the function, ignoring the inter-function semantics, making it difficult to handle situations such as function inlining. 
The combination of CG and internal code semantics is insufficient for addressing complex real-world scenarios. To address these limitations, we propose BinEnhance, a novel framework designed to leverage the inter-function semantics to enhance the expression of internal code semantics for binary code search. Specifically, BinEnhance constructs an External Environment Semantic Graph (EESG), which establishes a stable and analogous external environment for homologous functions by using different inter-function semantic relations (e.g., call, location, data-co-use). After the construction of EESG, we utilize the embeddings generated by existing internal code semantic models to initialize nodes of EESG. Finally, we design a Semantic Enhancement Model (SEM) that uses Relational Graph Convolutional Networks (RGCNs) and a residual block to learn valuable external semantics on the EESG for generating the enhanced semantics embedding. In addition, BinEnhance utilizes data feature similarity to refine the cosine similarity of semantic embeddings. We conduct experiments under six different tasks (e.g., under function inlining scenario) and the results illustrate the performance and robustness of BinEnhance. The application of BinEnhance to HermesSim, Asm2vec, TREX, Gemini, and Asteria on two public datasets results in an improvement of Mean Average Precision (MAP) from 53.6% to 69.7%. Moreover, the efficiency increases fourfold. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01102v3-abstract-full').style.display = 'none'; document.getElementById('2411.01102v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Network and Distributed System Security (NDSS) Symposium 2025 fall cycle</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00304">arXiv:2411.00304</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00304">pdf</a>, <a href="https://arxiv.org/format/2411.00304">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Unified Generative and Discriminative Training for Multi-modal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chow%2C+W">Wei Chow</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qifan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+K">Kaihang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+Z">Zhiqi Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanwang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qianru Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00304v1-abstract-short" style="display: inline;"> In recent times, Vision-Language Models (VLMs) have been trained under two predominant paradigms. Generative training has enabled Multimodal Large Language Models (MLLMs) to tackle various complex tasks, yet issues such as hallucinations and weak object discrimination persist. Discriminative training, exemplified by models like CLIP, excels in zero-shot image-text classification and retrieval, yet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00304v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00304v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00304v1-abstract-full" style="display: none;"> In recent times, Vision-Language Models (VLMs) have been trained under two predominant paradigms. Generative training has enabled Multimodal Large Language Models (MLLMs) to tackle various complex tasks, yet issues such as hallucinations and weak object discrimination persist. Discriminative training, exemplified by models like CLIP, excels in zero-shot image-text classification and retrieval, yet struggles with complex scenarios requiring fine-grained semantic differentiation. This paper addresses these challenges by proposing a unified approach that integrates the strengths of both paradigms. 
Considering interleaved image-text sequences as the general format of input samples, we introduce a structure-induced training strategy that imposes semantic relationships between input samples and the MLLM&#39;s hidden state. This approach enhances the MLLM&#39;s ability to capture global semantics and distinguish fine-grained semantics. By leveraging dynamic sequence alignment within the Dynamic Time Warping framework and integrating a novel kernel for fine-grained semantic differentiation, our method effectively balances generative and discriminative tasks. Extensive experiments demonstrate the effectiveness of our approach, achieving state-of-the-art results in multiple generative tasks, especially those requiring cognitive and discrimination abilities. Additionally, our method surpasses discriminative benchmarks in interleaved and fine-grained retrieval tasks. By employing a retrieval-augmented generation strategy, our approach further enhances performance in some generative tasks within one model, offering a promising direction for future research in vision-language modeling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00304v1-abstract-full').style.display = 'none'; document.getElementById('2411.00304v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23746">arXiv:2410.23746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23746">pdf</a>, <a href="https://arxiv.org/format/2410.23746">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DetectRL: Benchmarking LLM-Generated Text Detection in Real-World Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junchao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+R">Runzhe Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+D+F">Derek F. Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yulin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Chao%2C+L+S">Lidia S. Chao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23746v1-abstract-short" style="display: inline;"> Detecting text generated by large language models (LLMs) is of great recent interest. With zero-shot methods like DetectGPT, detection capabilities have reached impressive levels. However, the reliability of existing detectors in real-world applications remains underexplored. 
In this study, we present a new benchmark, DetectRL, highlighting that even state-of-the-art (SOTA) detection techniques st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23746v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23746v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23746v1-abstract-full" style="display: none;"> Detecting text generated by large language models (LLMs) is of great recent interest. With zero-shot methods like DetectGPT, detection capabilities have reached impressive levels. However, the reliability of existing detectors in real-world applications remains underexplored. In this study, we present a new benchmark, DetectRL, highlighting that even state-of-the-art (SOTA) detection techniques still underperformed in this task. We collected human-written datasets from domains where LLMs are particularly prone to misuse. Using popular LLMs, we generated data that better aligns with real-world applications. Unlike previous studies, we employed heuristic rules to create adversarial LLM-generated text, simulating advanced prompt usages, human revisions like word substitutions, and writing errors. Our development of DetectRL reveals the strengths and limitations of current SOTA detectors. More importantly, we analyzed the potential impact of writing styles, model types, attack methods, the text lengths, and real-world human writing factors on different types of detectors. We believe DetectRL could serve as an effective benchmark for assessing detectors in real-world scenarios, evolving with advanced attack methods, thus providing more stressful evaluation to drive the development of more efficient detectors. Data and code are publicly available at: https://github.com/NLP2CT/DetectRL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23746v1-abstract-full').style.display = 'none'; document.getElementById('2410.23746v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024 Dataset &amp; Benchmarking Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23079">arXiv:2410.23079</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23079">pdf</a>, <a href="https://arxiv.org/format/2410.23079">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BUZZ: Beehive-structured Sparse KV Cache with Segmented Heavy Hitters for Efficient LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Junqi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zhijin Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shaohui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shichao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23079v1-abstract-short" style="display: inline;"> Large language models (LLMs) are essential in natural language processing but often struggle with inference speed and computational efficiency, limiting real-time deployment. The key-value (KV) cache mechanism reduces computational overhead in transformer models, but challenges in maintaining contextual understanding remain. In this paper, we propose BUZZ, a novel KV caching algorithm that leverag&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23079v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23079v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23079v1-abstract-full" style="display: none;"> Large language models (LLMs) are essential in natural language processing but often struggle with inference speed and computational efficiency, limiting real-time deployment. The key-value (KV) cache mechanism reduces computational overhead in transformer models, but challenges in maintaining contextual understanding remain. In this paper, we propose BUZZ, a novel KV caching algorithm that leverages structured contextual information to minimize cache memory usage while enhancing inference speed. BUZZ employs a beehive-structured sparse cache, incorporating a sliding window to capture recent information and dynamically segmenting historical tokens into chunks to prioritize important tokens in local neighborhoods. We evaluate BUZZ on four real-world datasets: CNN/Daily Mail, XSUM, Wikitext, and 10-QA. Our results demonstrate that BUZZ (1) reduces cache memory usage by $\textbf{2.5}\times$ in LLM inference while maintaining over 99% accuracy in long-text summarization, and (2) surpasses state-of-the-art performance in multi-document question answering by $\textbf{7.69%}$ under the same memory limit, where full cache methods encounter out-of-memory issues. 
Additionally, BUZZ achieves significant inference speedup with a $\log{n}$ time complexity. The code is available at https://github.com/JunqiZhao888/buzz-llm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23079v1-abstract-full').style.display = 'none'; document.getElementById('2410.23079v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21807">arXiv:2410.21807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21807">pdf</a>, <a href="https://arxiv.org/format/2410.21807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Fresh Look at Generalized Category Discovery through Non-negative Matrix Factorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Z">Zhong Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingren Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+Y">Yanwei Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jungong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21807v2-abstract-short" style="display: inline;"> Generalized Category Discovery (GCD) aims to classify both base and novel images using labeled base data. However, current approaches inadequately address the intrinsic optimization of the co-occurrence matrix $\bar{A}$ based on cosine similarity, failing to achieve zero base-novel regions and adequate sparsity in base and novel domains. To address these deficiencies, we propose a Non-Negative Gen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21807v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21807v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21807v2-abstract-full" style="display: none;"> Generalized Category Discovery (GCD) aims to classify both base and novel images using labeled base data. However, current approaches inadequately address the intrinsic optimization of the co-occurrence matrix $\bar{A}$ based on cosine similarity, failing to achieve zero base-novel regions and adequate sparsity in base and novel domains. To address these deficiencies, we propose a Non-Negative Generalized Category Discovery (NN-GCD) framework. It employs Symmetric Non-negative Matrix Factorization (SNMF) as a mathematical medium to prove the equivalence of optimal K-means with optimal SNMF, and the equivalence of SNMF solver with non-negative contrastive learning (NCL) optimization. 
Utilizing these theoretical equivalences, it reframes the optimization of $\bar{A}$ and K-means clustering as an NCL optimization problem. Moreover, to satisfy the non-negative constraints and make a GCD model converge to a near-optimal region, we propose a GELU activation function and an NMF NCE loss. To transition $\bar{A}$ from a suboptimal state to the desired $\bar{A}^*$, we introduce a hybrid sparse regularization approach to impose sparsity constraints. Experimental results show NN-GCD outperforms state-of-the-art methods on GCD benchmarks, achieving an average accuracy of 66.1\% on the Semantic Shift Benchmark, surpassing prior counterparts by 4.7\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21807v2-abstract-full').style.display = 'none'; document.getElementById('2410.21807v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20790">arXiv:2410.20790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20790">pdf</a>, <a href="https://arxiv.org/format/2410.20790">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SparseTem: Boosting the Efficiency of CNN-Based Video Encoders by Exploiting Temporal Continuity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kunyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jieru Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+W">Wenchao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+M">Minyi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20790v1-abstract-short" style="display: inline;"> Deep learning models have become pivotal in the field of video processing and is increasingly critical in practical applications such as autonomous driving and object detection. Although Vision Transformers (ViTs) have demonstrated their power, Convolutional Neural Networks (CNNs) remain a highly efficient and high-performance choice for feature extraction and encoding. 
However, the intensive comp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20790v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20790v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20790v1-abstract-full" style="display: none;"> Deep learning models have become pivotal in the field of video processing and is increasingly critical in practical applications such as autonomous driving and object detection. Although Vision Transformers (ViTs) have demonstrated their power, Convolutional Neural Networks (CNNs) remain a highly efficient and high-performance choice for feature extraction and encoding. However, the intensive computational demands of convolution operations hinder its broader adoption as a video encoder. Given the inherent temporal continuity in video frames, changes between consecutive frames are minimal, allowing for the skipping of redundant computations. This technique, which we term as Diff Computation, presents two primary challenges. First, Diff Computation requires to cache intermediate feature maps to ensure the correctness of non-linear computations, leading to significant memory consumption. Second, the imbalance of sparsity among layers, introduced by Diff Computation, incurs accuracy degradation. To address these issues, we propose a memory-efficient scheduling method to eliminate memory overhead and an online adjustment mechanism to minimize accuracy degradation. We integrate these techniques into our framework, SparseTem, to seamlessly support various CNN-based video encoders. SparseTem achieves speedup of 1.79x for EfficientDet and 4.72x for CRNN, with minimal accuracy drop and no additional memory overhead. Extensive experimental results demonstrate that SparseTem sets a new state-of-the-art by effectively utilizing temporal continuity to accelerate CNN-based video encoders. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20790v1-abstract-full').style.display = 'none'; document.getElementById('2410.20790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20706">arXiv:2410.20706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20706">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Super Resolution Based on Deep Operator Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Siyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20706v1-abstract-short" style="display: inline;"> We use Deep Operator Networks (DeepONets) to perform super-resolution reconstruction of the solutions of two types of partial differential equations and compare the model predictions with the results obtained using conventional interpolation methods to verify the advantages of DeepONets. We employ two pooling methods to downsample the origin data and conduct super-resolution reconstruction under t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20706v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20706v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20706v1-abstract-full" style="display: none;"> We use Deep Operator Networks (DeepONets) to perform super-resolution reconstruction of the solutions of two types of partial differential equations and compare the model predictions with the results obtained using conventional interpolation methods to verify the advantages of DeepONets. We employ two pooling methods to downsample the origin data and conduct super-resolution reconstruction under three different resolutions of input images. The results show that the DeepONet model can predict high-frequency oscillations and small-scale structures from low-resolution inputs very well. For the two-dimensional problem, we introduce convolutional layers to extract information from input images at a lower cost than purer MLPs. We adjust the size of the training set and observe the variation of prediction errors. In both one-dimensional and two-dimensional cases, the super-resolution reconstruction using the DeepONet model demonstrates much more accurate prediction results than cubic spline interpolation, highlighting the superiority of operator learning methods in handling such problems compared to traditional interpolation techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20706v1-abstract-full').style.display = 'none'; document.getElementById('2410.20706v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20638">arXiv:2410.20638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20638">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Ant Detective: An Automated Approach for Counting Ants in Densely Populated Images and Gaining Insight into Ant Foraging Behavior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Das%2C+M">Mautushi Das</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F+C">Fang-Ling Chloe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hartle%2C+C">Charly Hartle</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C+S">Chin-Cheng Scotty Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C+P+J">C. P. James Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20638v1-abstract-short" style="display: inline;"> Ant foraging behavior is essential to understanding ecological dynamics and developing effective pest management strategies, but quantifying this behavior is challenging due to the labor-intensive nature of manual counting, especially in densely populated images. This study presents an automated approach using computer vision to count ants and analyze their foraging behavior. Leveraging the YOLOv8&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20638v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20638v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20638v1-abstract-full" style="display: none;"> Ant foraging behavior is essential to understanding ecological dynamics and developing effective pest management strategies, but quantifying this behavior is challenging due to the labor-intensive nature of manual counting, especially in densely populated images. This study presents an automated approach using computer vision to count ants and analyze their foraging behavior. Leveraging the YOLOv8 model, the system was calibrated and evaluated on datasets encompassing various imaging scenarios and densities. The study results demonstrate that the system achieves average precision and recall of up to 87.96% and 87,78%, respectively, with only 64 calibration images provided when the both calibration and evaluation images share similar imaging backgrounds. When the background is more complex than the calibration images, the system requires a larger calibration set to generalize effectively, with 1,024 images yielding the precision and recall of up to 83.60% and 78.88, respectively. In more challenging scenarios where more than one thousand ants are present in a single image, the system significantly improves detection accuracy by slicing images into smaller patches, reaching a precision and recall of 77.97% and 71.36%, respectively. The system&#39;s ability to generate heatmaps visualizes the spatial distribution of ant activity over time, providing valuable insights into their foraging patterns. 
This spatial-temporal analysis enables a more comprehensive understanding of ant behavior, which is crucial for ecological studies and improving pest control methods. By automating the counting process and offering detailed behavioral analysis, this study provides an efficient tool for researchers and pest control professionals to develop more effective strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20638v1-abstract-full').style.display = 'none'; document.getElementById('2410.20638v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19878">arXiv:2410.19878</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19878">pdf</a>, <a href="https://arxiv.org/format/2410.19878">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Parameter-Efficient Fine-Tuning in Large Models: A Survey of Methodologies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Luping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Sheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Linnan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Shu Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+R">Runze Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19878v2-abstract-short" style="display: inline;"> The large models, as predicted by scaling raw forecasts, have made groundbreaking progress in many fields, particularly in natural language generation tasks, where they have approached or even surpassed human levels. However, the unprecedented scale of their parameters brings significant computational and storage costs. These large models require substantial computational resources and GPU memory&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19878v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19878v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19878v2-abstract-full" style="display: none;"> The large models, as predicted by scaling raw forecasts, have made groundbreaking progress in many fields, particularly in natural language generation tasks, where they have approached or even surpassed human levels. However, the unprecedented scale of their parameters brings significant computational and storage costs. 
These large models require substantial computational resources and GPU memory to operate. When adapting large models to specific downstream tasks, their massive parameter scale poses a significant challenge in fine-tuning on hardware platforms with limited computational power and GPU memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT) offers a practical solution by efficiently adjusting the parameters of large pre-trained models to suit various downstream tasks. Specifically, PEFT adjusts the parameters of pre-trained large models to adapt to specific tasks or domains, minimizing the introduction of additional parameters and the computational resources required. This review mainly introduces the preliminary knowledge of PEFT, the core ideas and principles of various PEFT algorithms, the applications of PEFT, and potential future research directions. By reading this review, we believe that interested parties can quickly grasp the PEFT methodology, thereby accelerating its development and innovation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19878v2-abstract-full').style.display = 'none'; document.getElementById('2410.19878v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19737">arXiv:2410.19737</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19737">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> High-Precision Real-Time Pores Detection in LPBF using Thermal Energy Density (TED) Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+C">Chuxiao Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Porter%2C+C">Conor Porter</a>, <a href="/search/cs?searchtype=author&amp;query=Malakpour%2C+S">Sina Malakpour</a>, <a href="/search/cs?searchtype=author&amp;query=Mathesen%2C+G">Garrett Mathesen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Seongyeon Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19737v1-abstract-short" style="display: inline;"> Pore formation during Laser Powder Bed Fusion (LPBF) has long posed challenges in metal 3D printing, significantly affecting the mechanical properties of the final product. Porosity frequently occurs because of an unstable keyhole formation, triggered by an excess laser energy. 
Traditional approaches for detecting pores rely heavily on CT scanning, a time-consuming and costly method unsuitable for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19737v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19737v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19737v1-abstract-full" style="display: none;"> Pore formation during Laser Powder Bed Fusion (LPBF) has long posed challenges in metal 3D printing, significantly affecting the mechanical properties of the final product. Porosity frequently occurs because of an unstable keyhole formation, triggered by an excess laser energy. Traditional approaches for detecting pores rely heavily on CT scanning, a time-consuming and costly method unsuitable for large-scale production. In response to these limitations, we have developed a real-time pore detection method using thermal sensor data, offering a more efficient, cost-effective alternative for quality control during the LPBF process. Our method, validated against CT-scanned pore counts, provides a high degree of accuracy, achieving an R^2 value of 0.94 between the across eight sample prints. This approach also effectively tracks pore formation trends as the layer-wise printing pattern changes, providing timely insights into product quality, which may serve as important datapoints for real-time adaptive parameters optimization in the future. In contrast to prior machine learning-based techniques, which were limited by high computational costs and lacked direct validation strategy, the method intr <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19737v1-abstract-full').style.display = 'none'; document.getElementById('2410.19737v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">No comments so dar</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> no </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a 
href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
