Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 132 results for author: <span class="mathjax">Ge, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Ge%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ge, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ge%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ge, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ge%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ge%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ge%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Ge%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13663">arXiv:2502.13663</a> <span> [<a href="https://arxiv.org/pdf/2502.13663">pdf</a>, <a href="https://arxiv.org/ps/2502.13663">ps</a>, <a href="https://arxiv.org/format/2502.13663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> User Association and Coordinated Beamforming in Cognitive Aerial-Terrestrial Networks: A Safe Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zizhen Zhou</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jungang Ge</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Ying-Chang Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13663v1-abstract-short" style="display: inline;"> Cognitive aerial-terrestrial networks (CATNs) offer a solution to spectrum scarcity by sharing spectrum between aerial and terrestrial networks. However, aerial users (AUs) experience significant interference from numerous terrestrial base stations (BSs). To alleviate such interference, we investigate a user association and coordinated beamforming (CBF) problem in CATN, where the aerial network se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13663v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13663v1-abstract-full" style="display: none;"> Cognitive aerial-terrestrial networks (CATNs) offer a solution to spectrum scarcity by sharing spectrum between aerial and terrestrial networks. 
   However, aerial users (AUs) experience significant interference from numerous terrestrial base stations (BSs). To alleviate such interference, we investigate a user association and coordinated beamforming (CBF) problem in a CATN where the aerial network serves as the primary network sharing its spectrum with the terrestrial network. Specifically, we maximize the sum rate of the secondary terrestrial users (TUs) under the interference temperature constraints of the AUs. Traditional iterative optimization schemes are impractical due to their high computational complexity and information exchange overhead. Although deep reinforcement learning (DRL) based schemes can address these challenges, their performance is sensitive to the weights of the penalty terms that the reward function uses for constraint violations. Motivated by these issues, we propose a safe DRL-based user association and CBF scheme for CATN, eliminating the need to train multiple times to find a suitable penalty weight before actual deployment. Specifically, the CATN is modeled as a networked constrained partially observable Markov game. Each TU acts as an agent that chooses its associated BS, and each BS acts as an agent that decides its beamforming vectors, aiming to maximize the reward while satisfying the safety constraints introduced by the interference constraints of the AUs. By exploiting a safe DRL algorithm, the proposed scheme incurs lower deployment expenses than penalty-based DRL schemes, since only one training run is required before actual deployment. Simulation results show that the proposed scheme achieves a higher sum rate of TUs than a two-stage optimization scheme while keeping the average received interference power at the AUs generally below the threshold.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: This work has been submitted to the IEEE for possible publication.
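The abstract does not spell out which safe DRL algorithm is used; a common way to handle such a constrained objective is a primal-dual (Lagrangian) update, sketched below. The threshold d, step size eta, and the policy_step stub are illustrative assumptions, not the paper's method.

```python
# Hedged sketch of a primal-dual update for a constrained objective:
# maximize E[reward] subject to E[interference] <= d.
from collections import namedtuple

Transition = namedtuple("Transition", ["reward", "interference"])

def policy_step(policy, batch, lam):
    # Placeholder: in practice, a gradient step on E[reward - lam * interference].
    return policy

def primal_dual_step(policy, lam, batch, d=0.1, eta=0.01):
    avg_interf = sum(t.interference for t in batch) / len(batch)
    policy = policy_step(policy, batch, lam)       # primal update on the policy
    lam = max(0.0, lam + eta * (avg_interf - d))   # dual ascent on the violation
    return policy, lam

batch = [Transition(reward=1.0, interference=0.2), Transition(0.5, 0.05)]
policy, lam = primal_dual_step(policy=None, lam=0.0, batch=batch)
print(lam)  # multiplier grows because avg interference 0.125 > d = 0.1
```

The appeal, as the abstract notes, is that the constraint weight adapts during a single training run instead of being hand-tuned across repeated runs.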
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06453">arXiv:2502.06453</a> <span> [<a href="https://arxiv.org/pdf/2502.06453">pdf</a>, <a href="https://arxiv.org/format/2502.06453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaixuan Huang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiacheng Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zihao Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiang Ji</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenzhe Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yingqing Guo</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+T">Tianle Cai</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runzhe Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengdi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06453v2-abstract-short" style="display: inline;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06453v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06453v2-abstract-full" style="display: none;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. 
   To investigate this question, prior work has constructed mathematical benchmarks in which questions undergo simple perturbations: modifications that still preserve the underlying reasoning patterns of the solutions. However, no work has explored hard perturbations, which fundamentally change the nature of the problem so that the original solution steps do not apply. To bridge the gap, we construct MATH-P-Simple and MATH-P-Hard via simple perturbation and hard perturbation, respectively. Each consists of 279 perturbed math problems derived from level-5 (hardest) problems in the MATH dataset (Hendrycks et al., 2021). We observe significant performance drops on MATH-P-Hard across various models, including o1-mini (-16.49%) and gemini-2.0-flash-thinking (-12.9%). We also raise concerns about a novel form of memorization where models blindly apply learned problem-solving skills without assessing their applicability to modified contexts. This issue is amplified when using original problems for in-context learning. We call for research efforts to address this challenge, which is critical for developing more robust and reliable reasoning models.
   Submitted 12 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
   Comments: v2: fix bugs in Fig. 1.

3. arXiv:2501.13411 [pdf, other]
   Subjects: cs.SE (Software Engineering)
   Title: VulnBot: Autonomous Penetration Testing for A Multi-Agent Collaborative Framework
   Authors: He Kong, Die Hu, Jingguo Ge, Liangxiong Li, Tong Li, Bingzhen Wu
   Abstract: Penetration testing is a vital practice for identifying and mitigating vulnerabilities in cybersecurity systems, but its manual execution is labor-intensive and time-consuming. Existing large language model (LLM)-assisted or automated penetration testing approaches often suffer from inefficiencies, such as a lack of contextual understanding and excessive, unstructured data generation.
   This paper presents VulnBot, an automated penetration testing framework that leverages LLMs to simulate the collaborative workflow of human penetration testing teams through a multi-agent system. To address the inefficiencies and reliance on manual intervention in traditional penetration testing methods, VulnBot decomposes complex tasks into three specialized phases: reconnaissance, scanning, and exploitation. These phases are guided by a penetration task graph (PTG) to ensure logical task execution. Key design features include role specialization, penetration path planning, inter-agent communication, and generative penetration behavior. Experimental results demonstrate that VulnBot outperforms baseline models such as GPT-4 and Llama3 in automated penetration testing tasks, particularly showcasing its potential in fully autonomous testing on real-world machines.
   Submitted 23 January, 2025; originally announced January 2025.
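The abstract describes the PTG only as a dependency structure that enforces "logical task execution"; a minimal way to realize that is a topological traversal of a directed task graph, sketched below. The task names and the run() stub are made-up placeholders, not VulnBot's actual graph or agents.

```python
# Hedged sketch: executing penetration-test tasks in dependency order.
from graphlib import TopologicalSorter

# Map each task to the set of tasks it depends on (illustrative names only).
ptg = {
    "recon": set(),
    "port_scan": {"recon"},          # scanning depends on reconnaissance
    "service_enum": {"port_scan"},
    "exploit": {"service_enum"},     # exploitation runs last
}

def run(task: str) -> None:
    print(f"running {task}")         # stand-in for an LLM-agent action

for task in TopologicalSorter(ptg).static_order():
    run(task)                        # recon, port_scan, service_enum, exploit
```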
4. arXiv:2501.09980 [pdf]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
   Title: Aneumo: A Large-Scale Comprehensive Synthetic Dataset of Aneurysm Hemodynamics
   Authors: Xigui Li, Yuanye Zhou, Feiyang Xiao, Xin Guo, Yichi Zhang, Chen Jiang, Jianchao Ge, Xiansheng Wang, Qimeng Wang, Taiwei Zhang, Chensen Lin, Yuan Cheng, Yuan Qi
   Abstract: Intracranial aneurysm (IA) is a common cerebrovascular disease that is usually asymptomatic but may cause severe subarachnoid hemorrhage (SAH) if ruptured. Although clinical practice is usually based on individual factors and morphological features of the aneurysm, its pathophysiology and hemodynamic mechanisms remain controversial. To address the limitations of current research, this study constructed a comprehensive hemodynamic dataset of intracranial aneurysms. The dataset is based on 466 real aneurysm models, from which 10,000 synthetic models were generated by resection and deformation operations, comprising 466 aneurysm-free models and 9,534 deformed aneurysm models. The dataset also provides medical-image-like segmentation mask files to support insightful analysis.
   In addition, the dataset contains hemodynamic data measured at eight steady-state flow rates (0.001 to 0.004 kg/s), including critical parameters such as flow velocity, pressure, and wall shear stress, providing a valuable resource for investigating aneurysm pathogenesis and clinical prediction. This dataset will help advance the understanding of the pathologic features and hemodynamic mechanisms of intracranial aneurysms and support in-depth research in related fields. Dataset hosted at https://github.com/Xigui-Li/Aneumo.
   Submitted 17 January, 2025; originally announced January 2025.

5. arXiv:2501.07783 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
   Title: Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal Understanding
   Authors: Zhaokai Wang, Xizhou Zhu, Xue Yang, Gen Luo, Hao Li, Changyao Tian, Wenhan Dou, Junqi Ge, Lewei Lu, Yu Qiao, Jifeng Dai
   Abstract: Image pyramids are widely adopted in top-performing methods to obtain multi-scale features for precise visual perception and understanding. However, current image pyramids use the same large-scale model to process multiple resolutions of images, leading to significant computational cost.
   To address this challenge, we propose a novel network architecture, called Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses pretrained models (ViTs or CNNs) as branches to process multi-scale images, where images of higher resolutions are processed by smaller network branches to balance computational cost and performance. To integrate information from different spatial scales, we further propose a novel cross-branch feature interaction mechanism. To validate PIIP, we apply it to various perception models and a representative multimodal large language model called LLaVA, and conduct extensive experiments on various tasks such as object detection, segmentation, image classification and multimodal understanding. PIIP achieves superior performance compared to single-branch and existing multi-resolution approaches with lower computational cost. When applied to InternViT-6B, a large-scale vision foundation model, PIIP can improve its performance by 1%-2% on detection and segmentation with only 40%-60% of the original computation, finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and 74.5% on MMBench with only 2.8M training data. Our code is released at https://github.com/OpenGVLab/PIIP.
   Submitted 13 January, 2025; originally announced January 2025.
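The parameter-inverted pairing (a bigger model on the lower resolution, a smaller model on the higher resolution) can be made concrete with a toy forward pass. This is a sketch under stated assumptions: the tiny conv branches and mean fusion below are placeholders for PIIP's pretrained branches and its cross-branch interaction mechanism.

```python
# Hedged sketch of a parameter-inverted image pyramid, assuming PyTorch.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyPIIP(nn.Module):
    def __init__(self):
        super().__init__()
        # "Large" branch (more parameters) sees the LOW-resolution input.
        self.big = nn.Sequential(
            nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(),
            nn.Conv2d(128, 64, 3, padding=1),
        )
        # "Small" branch (fewer parameters) sees the HIGH-resolution input.
        self.small = nn.Conv2d(3, 64, 3, padding=1)

    def forward(self, image):
        low = F.interpolate(image, scale_factor=0.5, mode="bilinear")
        f_low = self.big(low)      # heavy model, cheap input
        f_high = self.small(image) # light model, expensive input
        # Upsample and average: a crude stand-in for cross-branch interaction.
        f_low = F.interpolate(f_low, size=f_high.shape[-2:], mode="bilinear")
        return (f_low + f_high) / 2

feats = ToyPIIP()(torch.randn(1, 3, 224, 224))
print(feats.shape)  # torch.Size([1, 64, 224, 224])
```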
6. arXiv:2501.06719 [pdf, other]
   Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
   Title: Hierarchical Sampling-based Planner with LTL Constraints and Text Prompting
   Authors: Jingzhan Ge, Zi-Hao Zhang, Sheng-En Huang
   Abstract: This project introduces a hierarchical planner integrating Linear Temporal Logic (LTL) constraints with natural language prompting for robot motion planning. The framework decomposes maps into regions, generates directed graphs, and converts them into transition systems for high-level planning. Text instructions are translated into LTL formulas and converted to Deterministic Finite Automata (DFA) for sequential goal-reaching tasks while adhering to safety constraints. High-level plans, derived via Breadth-First Search (BFS), guide low-level planners such as Rapidly-exploring Random Trees (RRT) and Probabilistic Roadmaps (PRM) for obstacle-avoidant navigation along with LTL tasks. The approach demonstrates adaptability to various task complexities, though challenges such as graph construction overhead and suboptimal path generation remain. Future directions include extending to terrain conditions and incorporating higher-order dynamics.
   Submitted 12 January, 2025; originally announced January 2025.
   Comments: 8 pages, 17 figures.
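High-level planning over a transition system subject to a DFA constraint is typically a BFS over their product, returning a region sequence for the low-level RRT/PRM planner to track. The toy region graph and single-goal DFA below are assumptions for illustration, not the paper's benchmarks.

```python
# Hedged sketch: BFS over the product of a transition system and a DFA.
from collections import deque

ts = {"start": ["hall"], "hall": ["goal", "start"], "goal": []}  # region graph

def dfa_step(q, region):
    # Toy DFA for "eventually reach goal": accept once "goal" is visited.
    return "accept" if (q == "accept" or region == "goal") else "q0"

def plan(init_region, init_q="q0"):
    queue, seen = deque([(init_region, init_q, [init_region])]), set()
    while queue:
        region, q, path = queue.popleft()
        if q == "accept":
            return path                    # high-level plan for RRT/PRM to track
        for nxt in ts[region]:
            state = (nxt, dfa_step(q, nxt))
            if state not in seen:
                seen.add(state)
                queue.append((nxt, state[1], path + [nxt]))
    return None

print(plan("start"))  # -> ['start', 'hall', 'goal']
```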
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00912">arXiv:2501.00912</a> <span> [<a href="https://arxiv.org/pdf/2501.00912">pdf</a>, <a href="https://arxiv.org/format/2501.00912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AutoPresent: Designing Structured Visuals from Scratch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiaxin Ge</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z+Z">Zora Zhiruo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xuhui Zhou</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yi-Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Subramanian%2C+S">Sanjay Subramanian</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Q">Qinyue Tan</a>, <a href="/search/cs?searchtype=author&query=Sap%2C+M">Maarten Sap</a>, <a href="/search/cs?searchtype=author&query=Suhr%2C+A">Alane Suhr</a>, <a href="/search/cs?searchtype=author&query=Fried%2C+D">Daniel Fried</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&query=Darrell%2C+T">Trevor Darrell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00912v1-abstract-short" style="display: inline;"> Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00912v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00912v1-abstract-full" style="display: none;"> Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation with 7k training and 585 testing examples derived from 310 slide decks across 10 domains. SlidesBench supports evaluations that are (i)reference-based to measure similarity to a target slide, and (ii)reference-free to measure the design quality of generated slides alone. 
   We benchmark end-to-end image generation and program generation methods with a variety of models, and find that programmatic methods produce higher-quality slides in user-interactable formats. Building on the success of program generation, we create AutoPresent, an 8B Llama-based model trained on 7k pairs of instructions paired with code for slide generation, and achieve results comparable to the closed-source model GPT-4o. We further explore iterative design refinement, where the model is tasked to self-refine its own output, and find that this process improves slide quality. We hope that our work will provide a basis for future work on generating structured visuals.
   Submitted 1 January, 2025; originally announced January 2025.
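The abstract does not say which slide-authoring library the generated programs target; as one plausible form of "code for slide generation", here is a minimal program using the python-pptx library. Treating python-pptx as the target is an assumption, not necessarily AutoPresent's output format, and the slide content is made up.

```python
# Hedged sketch: the kind of program a slide-generation model might emit.
from pptx import Presentation

prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[1])  # title + content layout
slide.shapes.title.text = "Quarterly Results"

body = slide.placeholders[1].text_frame
body.text = "Revenue up 12%"
bullet = body.add_paragraph()
bullet.text = "New markets: EU, APAC"
bullet.level = 1                                    # indented sub-bullet

prs.save("deck.pptx")
```

A program like this is "user-interactable" in the abstract's sense: the resulting .pptx can be opened and edited, unlike a rendered image of a slide.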
8. arXiv:2412.09799 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: CP-DETR: Concept Prompt Guide DETR Toward Stronger Universal Object Detection
   Authors: Qibo Chen, Weizhong Jin, Jianyue Ge, Mengdi Liu, Yuchao Yan, Jian Jiang, Li Yu, Xuanjiang Guo, Shuchang Li, Jianzhong Chen
   Abstract: Recent research on universal object detection aims to introduce language into a SoTA closed-set detector and then generalize to open-set concepts by constructing large-scale (text-region) datasets for training. However, these methods face two main challenges: (i) how to efficiently use the prior information in the prompts to genericise objects, and (ii) how to reduce alignment bias in the downstream tasks, both leading to sub-optimal performance in some scenarios beyond pre-training. To address these challenges, we propose a strong universal detection foundation model called CP-DETR, which is competitive in almost all scenarios, with only one pre-training weight. Specifically, we design an efficient prompt visual hybrid encoder that enhances the information interaction between prompt and visual features through scale-by-scale and multi-scale fusion modules. The hybrid encoder is then encouraged to fully utilize the prompt information via a prompt multi-label loss and an auxiliary detection head. In addition to text prompts, we design two practical concept prompt generation methods, visual prompt and optimized prompt, to extract abstract concepts through concrete visual examples and stably reduce alignment bias in downstream tasks. With these effective designs, CP-DETR demonstrates superior universal detection performance in a broad spectrum of scenarios. For example, our Swin-T backbone model achieves 47.6 zero-shot AP on LVIS, and the Swin-L backbone model achieves 32.2 zero-shot AP on ODinW35. Furthermore, our visual prompt generation method achieves 68.4 AP on COCO val via interactive detection, and the optimized prompt achieves 73.1 full-shot AP on ODinW13.
   Submitted 12 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09616">arXiv:2412.09616</a> <span> [<a href="https://arxiv.org/pdf/2412.09616">pdf</a>, <a href="https://arxiv.org/format/2412.09616">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> V2PE: Improving Multimodal Long-Context Capability of Vision-Language Models with Variable Visual Position Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Junqi Ge</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyi Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jintao Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinguo Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xihui Liu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xizhou Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09616v2-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) have shown promising capabilities in handling various multimodal tasks, yet they struggle in long-context scenarios, particularly in tasks involving videos, high-resolution images, or lengthy image-text documents. In our work, we first conduct an empirical analysis of the long-context capabilities of VLMs using our augmented long-context multimodal datasets. Our findi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09616v2-abstract-full').style.display = 'inline'; document.getElementById('2412.09616v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09616v2-abstract-full" style="display: none;"> Vision-Language Models (VLMs) have shown promising capabilities in handling various multimodal tasks, yet they struggle in long-context scenarios, particularly in tasks involving videos, high-resolution images, or lengthy image-text documents. In our work, we first conduct an empirical analysis of the long-context capabilities of VLMs using our augmented long-context multimodal datasets. Our findings reveal that directly applying the positional encoding mechanism used for textual tokens to visual tokens is suboptimal, and VLM performance degrades sharply when the position encoding exceeds the model's context window. To address this, we propose Variable Visual Position Encoding (V2PE), a novel positional encoding approach that employs variable and smaller increments for visual tokens, enabling more efficient management of long multimodal sequences. Our experiments demonstrate the effectiveness of V2PE to enhances VLMs' ability to effectively understand and reason over long multimodal contexts. We further integrate V2PE with our augmented long-context multimodal datasets to fine-tune the open-source VLM, InternVL2. 
   The fine-tuned model achieves strong performance on both standard and long-context multimodal tasks. Notably, when the sequence length of the training dataset is increased to 256K tokens, the model is capable of processing multimodal sequences up to 1M tokens, highlighting its potential for real-world long-context applications.
   Submitted 12 December, 2024; v1 submitted 12 December, 2024; originally announced December 2024.
   Comments: The code and models will be available at https://github.com/OpenGVLab/V2PE.
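The stated mechanism (text tokens advance the position index by 1, visual tokens by a smaller stride) can be sketched directly; the stride value 0.25 below is an arbitrary assumption, not the paper's recommended setting.

```python
# Hedged sketch of variable position increments for visual tokens.
def v2pe_positions(is_visual, delta=0.25):
    """is_visual[i] marks token i as visual; delta is the visual stride."""
    positions, pos = [], 0.0
    for visual in is_visual:
        positions.append(pos)
        pos += delta if visual else 1.0  # visual tokens consume less position range
    return positions

# A text token, four visual tokens, then a text token:
print(v2pe_positions([False, True, True, True, True, False]))
# -> [0.0, 1.0, 1.25, 1.5, 1.75, 2.0]
```

This illustrates why the scheme stretches the effective context: four visual tokens advance the position index by only 1.0, so far longer multimodal sequences fit inside the same position-encoding range.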
href="/search/cs?searchtype=author&query=Li%2C+J">Jingwen Li</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhongying Tu</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingcheng Zhang</a> , et al. (4 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09596v1-abstract-short" style="display: inline;"> Creating AI systems that can interact with environments over long periods, similar to human cognition, has been a longstanding research goal. Recent advancements in multimodal large language models (MLLMs) have made significant strides in open-world understanding. However, the challenge of continuous and simultaneous streaming perception, memory, and reasoning remains largely unexplored. Current M… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09596v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09596v1-abstract-full" style="display: none;"> Creating AI systems that can interact with environments over long periods, similar to human cognition, has been a longstanding research goal. Recent advancements in multimodal large language models (MLLMs) have made significant strides in open-world understanding. However, the challenge of continuous and simultaneous streaming perception, memory, and reasoning remains largely unexplored. Current MLLMs are constrained by their sequence-to-sequence architecture, which limits their ability to process inputs and generate responses simultaneously, akin to being unable to think while perceiving. Furthermore, relying on long contexts to store historical data is impractical for long-term interactions, as retaining all information becomes costly and inefficient. Therefore, rather than relying on a single foundation model to perform all functions, this project draws inspiration from the concept of the Specialized Generalist AI and introduces disentangled streaming perception, reasoning, and memory mechanisms, enabling real-time interaction with streaming video and audio input. The proposed framework InternLM-XComposer2.5-OmniLive (IXC2.5-OL) consists of three key modules: (1) Streaming Perception Module: Processes multimodal information in real-time, storing key details in memory and triggering reasoning in response to user queries. (2) Multi-modal Long Memory Module: Integrates short-term and long-term memory, compressing short-term memories into long-term ones for efficient retrieval and improved accuracy. (3) Reasoning Module: Responds to queries and executes reasoning tasks, coordinating with the perception and memory modules. This project simulates human-like cognition, enabling multimodal large language models to provide continuous and adaptive service over time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09596v1-abstract-full').style.display = 'none'; document.getElementById('2412.09596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Github Repo: https://github.com/InternLM/InternLM-XComposer/tree/main/InternLM-XComposer-2.5-OmniLive</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07924">arXiv:2412.07924</a> <span> [<a href="https://arxiv.org/pdf/2412.07924">pdf</a>, <a href="https://arxiv.org/format/2412.07924">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> A large language model-based approach to quantifying the effects of social determinants in liver transplant decisions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Robitschek%2C+E">Emily Robitschek</a>, <a href="/search/cs?searchtype=author&query=Bastani%2C+A">Asal Bastani</a>, <a href="/search/cs?searchtype=author&query=Horwath%2C+K">Kathryn Horwath</a>, <a href="/search/cs?searchtype=author&query=Sordean%2C+S">Savyon Sordean</a>, <a href="/search/cs?searchtype=author&query=Pletcher%2C+M+J">Mark J. Pletcher</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+J+C">Jennifer C. Lai</a>, <a href="/search/cs?searchtype=author&query=Galletta%2C+S">Sergio Galletta</a>, <a href="/search/cs?searchtype=author&query=Ash%2C+E">Elliott Ash</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jin Ge</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+I+Y">Irene Y. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07924v2-abstract-short" style="display: inline;"> Patient life circumstances, including social determinants of health (SDOH), shape both health outcomes and care access, contributing to persistent disparities across gender, race, and socioeconomic status. Liver transplantation exemplifies these challenges, requiring complex eligibility and allocation decisions where SDOH directly influence patient evaluation. We developed an artificial intelligen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07924v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07924v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07924v2-abstract-full" style="display: none;"> Patient life circumstances, including social determinants of health (SDOH), shape both health outcomes and care access, contributing to persistent disparities across gender, race, and socioeconomic status. Liver transplantation exemplifies these challenges, requiring complex eligibility and allocation decisions where SDOH directly influence patient evaluation. We developed an artificial intelligence (AI)-driven framework to analyze how broadly defined SDOH -- encompassing both traditional social determinants and transplantation-related psychosocial factors -- influence patient care trajectories. Using large language models, we extracted 23 SDOH factors related to patient eligibility for liver transplantation from psychosocial evaluation notes. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05271">arXiv:2412.05271</a> <span> [<a href="https://arxiv.org/pdf/2412.05271">pdf</a>, <a href="https://arxiv.org/format/2412.05271">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yue Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yangzhou Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangwei Gao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+E">Erfei Cui</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinguo Zhu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Shenglong Ye</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhaoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+L">Lixin Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xuehui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qingyun Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yimin Ren</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiapeng Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>,
href="/search/cs?searchtype=author&query=Jiang%2C+T">Tan Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingcheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+H">Han Lv</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a> , et al. (17 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05271v4-abstract-short" style="display: inline;"> We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05271v4-abstract-full').style.display = 'inline'; document.getElementById('2412.05271v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05271v4-abstract-full" style="display: none;"> We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision encoders, language models, dataset sizes, and test-time configurations. Through extensive evaluations on a wide range of benchmarks, including multi-discipline reasoning, document understanding, multi-image / video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing, InternVL 2.5 exhibits competitive performance, rivaling leading commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is the first open-source MLLMs to surpass 70% on the MMMU benchmark, achieving a 3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing strong potential for test-time scaling. We hope this model contributes to the open-source community by setting new standards for developing and applying multimodal AI systems. HuggingFace demo see https://huggingface.co/spaces/OpenGVLab/InternVL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05271v4-abstract-full').style.display = 'none'; document.getElementById('2412.05271v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18290">arXiv:2411.18290</a> <span> [<a href="https://arxiv.org/pdf/2411.18290">pdf</a>, <a href="https://arxiv.org/format/2411.18290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Semantic Asymmetry for Precise Gross Tumor Volume Segmentation of Nasopharyngeal Carcinoma in Planning CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zi Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zeli Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tai Ma</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+T+C+W">Tony C. W. Mok</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan-Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunhai Bai</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhinlin Zheng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Le Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yirui Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jia Ge</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xianghua Ye</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Senxiang Yan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Dakai Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18290v2-abstract-short" style="display: inline;"> In the radiation therapy of nasopharyngeal carcinoma (NPC), clinicians typically delineate the gross tumor volume (GTV) using non-contrast planning computed tomography to ensure accurate radiation dose delivery. However, the low contrast between tumors and adjacent normal tissues necessitates that radiation oncologists manually delineate the tumors, often relying on diagnostic MRI for guidance. %… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18290v2-abstract-full').style.display = 'inline'; document.getElementById('2411.18290v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18290v2-abstract-full" style="display: none;"> In the radiation therapy of nasopharyngeal carcinoma (NPC), clinicians typically delineate the gross tumor volume (GTV) using non-contrast planning computed tomography to ensure accurate radiation dose delivery. However, the low contrast between tumors and adjacent normal tissues necessitates that radiation oncologists manually delineate the tumors, often relying on diagnostic MRI for guidance. 
% In this study, we propose a novel approach to directly segment NPC gross tumors on non-contrast planning CT images, circumventing potential registration errors when aligning MRI or MRI-derived tumor masks to planning CT. To address the low contrast issues between tumors and adjacent normal structures in planning CT, we introduce a 3D Semantic Asymmetry Tumor segmentation (SATs) method. Specifically, we posit that a healthy nasopharyngeal region is characteristically bilaterally symmetric, whereas the emergence of nasopharyngeal carcinoma disrupts this symmetry. Then, we propose a Siamese contrastive learning segmentation framework that minimizes the voxel-wise distance between original and flipped areas without tumor and encourages a larger distance between original and flipped areas with tumor. Thus, our approach enhances the sensitivity of features to semantic asymmetries. % Extensive experiments demonstrate that the proposed SATs achieves the leading NPC GTV segmentation performance in both internal and external testing, \emph{e.g.}, with at least 2\% absolute Dice score improvement and 12\% average distance error reduction when compared to other state-of-the-art methods in the external testing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18290v2-abstract-full').style.display = 'none'; document.getElementById('2411.18290v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02715">arXiv:2411.02715</a> <span> [<a href="https://arxiv.org/pdf/2411.02715">pdf</a>, <a href="https://arxiv.org/format/2411.02715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CIT: Rethinking Class-incremental Semantic Segmentation with a Class Independent Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jinchao Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Akide Liu</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+M+H">Minh Hieu Phan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Y">Yangyang Shu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02715v1-abstract-short" style="display: inline;"> Class-incremental semantic segmentation (CSS) requires that a model learn to segment new classes without forgetting how to segment previous ones: this is typically achieved by distilling the current knowledge and incorporating the latest data. 
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Class-incremental semantic segmentation (CSS) requires that a model learn to segment new classes without forgetting how to segment previous ones: this is typically achieved by distilling the current knowledge and incorporating the latest data. However, bypassing iterative distillation by directly transferring outputs of initial classes to the current learning task is not supported in existing class-specific CSS methods. Via Softmax, they enforce dependency between classes and adjust the output distribution at each learning step, resulting in a large probability distribution gap between initial and current tasks. We introduce a simple, yet effective Class Independent Transformation (CIT) that converts the outputs of existing semantic segmentation models into class-independent forms with negligible cost or performance loss. By utilizing class-independent predictions facilitated by CIT, we establish an accumulative distillation framework, ensuring equitable incorporation of all class information. We conduct extensive experiments on various segmentation architectures, including DeepLabV3, Mask2Former, and SegViTv2. Results from these experiments show minimal task forgetting across different datasets, with less than 5% for ADE20K in the most challenging 11 task configurations and less than 1% across all configurations for the PASCAL VOC 2012 dataset. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li>
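The Softmax-coupling problem the abstract describes is easy to demonstrate. The snippet below (an illustration of the motivation, not the authors' exact transformation) shows that adding a class at a later step shifts old classes' softmax probabilities, while per-class sigmoid scores stay fixed:

```python
# Softmax couples every class's probability to all others, so adding classes
# in a later incremental step shifts old outputs; sigmoid scores do not move.
import torch

logits_step1 = torch.tensor([2.0, 0.5])        # classes {background, cat}
logits_step2 = torch.tensor([2.0, 0.5, 1.8])   # a later step adds {dog}

print(torch.softmax(logits_step1, 0))          # tensor([0.8176, 0.1824])
print(torch.softmax(logits_step2, 0)[:2])      # old classes' probs shrink
print(torch.sigmoid(logits_step1))             # class-independent scores...
print(torch.sigmoid(logits_step2)[:2])         # ...identical for old classes
```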
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02619">arXiv:2411.02619</a> <span> [<a href="https://arxiv.org/pdf/2411.02619">pdf</a>, <a href="https://arxiv.org/format/2411.02619">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tracking Tumors under Deformation from Partial Point Clouds using Occupancy Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Henrich%2C+P">Pit Henrich</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiawei Liu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&query=Schmidgall%2C+S">Samuel Schmidgall</a>, <a href="/search/cs?searchtype=author&query=Shepard%2C+L">Lauren Shepard</a>, <a href="/search/cs?searchtype=author&query=Ghazi%2C+A+E">Ahmed Ezzat Ghazi</a>, <a href="/search/cs?searchtype=author&query=Mathis-Ullrich%2C+F">Franziska Mathis-Ullrich</a>, <a href="/search/cs?searchtype=author&query=Krieger%2C+A">Axel Krieger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02619v1-abstract-short" style="display: inline;"> To track tumors during surgery, information from preoperative CT scans is used to determine their position. However, as the surgeon operates, the tumor may be deformed which presents a major hurdle for accurately resecting the tumor, and can lead to surgical inaccuracy, increased operation time, and excessive margins. This issue is particularly pronounced in robot-assisted partial nephrectomy (RAP… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02619v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02619v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02619v1-abstract-full" style="display: none;"> To track tumors during surgery, information from preoperative CT scans is used to determine their position. However, as the surgeon operates, the tumor may be deformed which presents a major hurdle for accurately resecting the tumor, and can lead to surgical inaccuracy, increased operation time, and excessive margins. This issue is particularly pronounced in robot-assisted partial nephrectomy (RAPN), where the kidney undergoes significant deformations during operation. Toward addressing this, we introduce a occupancy network-based method for the localization of tumors within kidney phantoms undergoing deformations at interactive speeds. We validate our method by introducing a 3D hydrogel kidney phantom embedded with exophytic and endophytic renal tumors. It closely mimics real tissue mechanics to simulate kidney deformation during in vivo surgery, providing excellent contrast and clear delineation of tumor margins to enable automatic threshold-based segmentation. 
Our findings indicate that the proposed method can localize tumors in moderately deforming kidneys with a margin of 6mm to 10mm, while providing essential volumetric 3D information at over 60Hz. This capability directly enables downstream tasks such as robotic resection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02619v1-abstract-full').style.display = 'none'; document.getElementById('2411.02619v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at IROS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13647">arXiv:2410.13647</a> <span> [<a href="https://arxiv.org/pdf/2410.13647">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multimodal growth and development assessment model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zichen Song</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zijie Gong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Sitan Huang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiewei Ge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13647v1-abstract-short" style="display: inline;"> With the development of social economy and the improvement of people's attention to health, the growth and development of children and adolescents has become an important indicator to measure the level of national health. Therefore, accurate and timely assessment of children's growth and development has become increasingly important. At the same time, global health inequalities, especially child m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13647v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13647v1-abstract-full" style="display: none;"> With the development of social economy and the improvement of people's attention to health, the growth and development of children and adolescents has become an important indicator to measure the level of national health. Therefore, accurate and timely assessment of children's growth and development has become increasingly important. At the same time, global health inequalities, especially child malnutrition and stunting in developing countries, urgently require effective assessment tools to monitor and intervene. 
In recent years, the rapid development of technologies such as big data, artificial intelligence, and cloud computing, and the cross-integration of disciplines such as biomedicine, statistics, and computer science, have promoted the rapid development of large-scale models for growth and development assessment. However, problems remain, such as overly narrow evaluation factors, inaccurate diagnostic results, and an inability to give accurate and reasonable recommendations. The multimodal growth and development assessment model uses the public dataset of the RSNA (Radiological Society of North America) as the training set and the dataset of the Department of Pediatrics of Huaibei People's Hospital as the open-source test set. The embedded ICL module enables the model to quickly adapt and identify the tasks that need to be done, ensuring that, while multiple evaluation factors are considered, accurate diagnostic results and reasonable medical recommendations are given, thereby providing solutions to the above problems and promoting the development of the medical field. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06351">arXiv:2410.06351</a> <span> [<a href="https://arxiv.org/pdf/2410.06351">pdf</a>, <a href="https://arxiv.org/format/2410.06351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Moving Faster and Reducing Risk: Using LLMs in Release Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abreu%2C+R">Rui Abreu</a>, <a href="/search/cs?searchtype=author&query=Murali%2C+V">Vijayaraghavan Murali</a>, <a href="/search/cs?searchtype=author&query=Rigby%2C+P+C">Peter C Rigby</a>, <a href="/search/cs?searchtype=author&query=Maddila%2C+C">Chandra Maddila</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Weiyan Sun</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jun Ge</a>, <a href="/search/cs?searchtype=author&query=Chinniah%2C+K">Kaavya Chinniah</a>, <a href="/search/cs?searchtype=author&query=Mockus%2C+A">Audris Mockus</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+M">Megh Mehta</a>, <a href="/search/cs?searchtype=author&query=Nagappan%2C+N">Nachiappan Nagappan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Release engineering has traditionally focused on continuously delivering features and bug fixes to users, but at a certain scale, it becomes impossible for a release engineering team to determine what should be released. At Meta's scale, the responsibility appropriately and necessarily falls back on the engineer writing and reviewing the code. To address this challenge, we developed models of diff risk scores (DRS) to determine how likely a diff is to cause a SEV, i.e., a severe fault that impacts end-users. Assuming that SEVs are only caused by diffs, a naive model could randomly gate X% of diffs from landing, which would automatically catch X% of SEVs on average. However, we aimed to build a model that can capture Y% of SEVs by gating X% of diffs, where Y >> X. By training the model on historical data on diffs that have caused SEVs in the past, we can predict the riskiness of an outgoing diff to cause a SEV. Diffs that are beyond a particular threshold of risk can then be gated. We have four types of gating: no gating (green), weekend gating (weekend), medium impact on end-users (yellow), and high impact on end-users (red). The input parameter for our models is the level of gating, and the outcome measure is the number of captured SEVs. Our research approaches include a logistic regression model, a BERT-based model, and generative LLMs. Our baseline regression model captures 18.7%, 27.9%, and 84.6% of SEVs while respectively gating the top 5% (weekend), 10% (yellow), and 50% (red) of risky diffs. The BERT-based model, StarBERT, only captures 0.61x, 0.85x, and 0.81x as many SEVs as the logistic regression for the weekend, yellow, and red gating zones, respectively. The generative LLMs, iCodeLlama-34B and iDiffLlama-13B, when risk-aligned, capture more SEVs than the logistic regression model in production: 1.40x, 1.52x, 1.05x, respectively. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
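The gating arithmetic in this abstract is straightforward to reproduce on synthetic data. The sketch below (illustrative only; the data and risk model are made up) scores diffs, gates the riskiest X%, and checks what fraction of SEV-causing diffs would have been caught; an informative model captures far more than X%:

```python
# Back-of-envelope risk gating on synthetic diffs: gate the top X% by risk
# score and measure SEV capture (the Y in "Y >> X").
import numpy as np

rng = np.random.default_rng(0)
n = 10_000
caused_sev = rng.random(n) < 0.02  # ~2% of diffs cause a SEV
# A useful model scores SEV-causing diffs higher on average than safe ones.
risk = rng.normal(loc=np.where(caused_sev, 1.0, 0.0), scale=1.0)

for gate_pct, zone in [(5, "weekend"), (10, "yellow"), (50, "red")]:
    threshold = np.percentile(risk, 100 - gate_pct)
    gated = risk >= threshold
    capture = (gated & caused_sev).sum() / caused_sev.sum()
    print(f"gate top {gate_pct:>2}% ({zone}): captures {capture:.0%} of SEVs")
```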
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14744">arXiv:2408.14744</a> <span> [<a href="https://arxiv.org/pdf/2408.14744">pdf</a>, <a href="https://arxiv.org/format/2408.14744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RSTeller: Scaling Up Visual Language Modeling in Remote Sensing with Rich Linguistic Semantics from Openly Available Data and Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Junyao Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yang Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+K">Kaitai Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jimin Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Abundant, well-annotated multimodal data in remote sensing are pivotal for aligning complex visual remote sensing (RS) scenes with human language, enabling the development of specialized vision language models across diverse RS interpretation tasks. However, annotating RS images with rich linguistic semantics at scale demands expertise in RS and substantial human labor, making it costly and often impractical. In this study, we propose a workflow that leverages large language models (LLMs) to generate multimodal datasets with semantically rich captions at scale from plain OpenStreetMap (OSM) data for images sourced from the Google Earth Engine (GEE) platform. This approach facilitates the generation of paired remote sensing data and can be readily scaled up using openly available data. Within this framework, we present RSTeller, a multimodal dataset comprising over 1.3 million RS images, each accompanied by two descriptive captions. Extensive experiments demonstrate that RSTeller enhances the performance of multiple existing vision language models for RS scene understanding through continual pre-training. Our methodology significantly reduces the manual effort and expertise needed for annotating remote sensing imagery while democratizing access to high-quality annotated data. This advancement fosters progress in visual language modeling and encourages broader participation in remote sensing research and applications. The RSTeller dataset is available at https://github.com/SlytherinGe/RSTeller. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ISPRS</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.8; I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13491">arXiv:2408.13491</a> <span> [<a href="https://arxiv.org/pdf/2408.13491">pdf</a>, <a href="https://arxiv.org/format/2408.13491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ESA: Annotation-Efficient Active Learning for Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jinchao Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+M+H">Minh Hieu Phan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Akide Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Active learning enhances annotation efficiency by selecting the most revealing samples for labeling, thereby reducing reliance on extensive human input. Previous methods in semantic segmentation have centered on individual pixels or small areas, neglecting the rich patterns in natural images and the power of advanced pre-trained models. To address these challenges, we propose three key contributions. First, we introduce Entity-Superpixel Annotation (ESA), an innovative and efficient active learning strategy that utilizes a class-agnostic mask proposal network coupled with superpixel grouping to capture local structural cues. Second, our method selects a subset of entities within each image of the target domain, prioritizing superpixels with high entropy to ensure comprehensive representation, while focusing on a limited number of key entities to optimize for efficiency. Third, by utilizing an annotator-friendly design that capitalizes on the inherent structure of images, our approach significantly outperforms existing pixel-based methods, achieving superior results with minimal queries, specifically reducing click cost by 98% and enhancing performance by 1.71%. For instance, our technique requires a mere 40 clicks for annotation, in stark contrast to the 5000 clicks demanded by conventional methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
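The entropy-driven selection rule can be sketched in a few lines. The following is an illustrative NumPy sketch, not the authors' implementation: pixel-wise predictive entropy is averaged over precomputed superpixels, and the highest-entropy superpixels are queried for annotation:

```python
# Aggregate pixel-wise predictive entropy over superpixels, then query the
# most uncertain ones (toy data; superpixel ids would come from a grouper).
import numpy as np

def entropy(probs, eps=1e-12):
    return -(probs * np.log(probs + eps)).sum(axis=-1)

H, W, C, K = 64, 64, 8, 50
rng = np.random.default_rng(1)
probs = rng.dirichlet(np.ones(C), size=(H, W))  # model's per-pixel softmax
superpixels = rng.integers(0, K, size=(H, W))   # precomputed superpixel ids

pix_ent = entropy(probs)                        # (H, W)
sp_ent = np.array([pix_ent[superpixels == k].mean() for k in range(K)])
to_annotate = np.argsort(sp_ent)[-5:]           # a few key entities per image
print("query superpixels:", to_annotate)
```

Annotating one label per queried superpixel instead of clicking individual pixels is where the claimed click-cost reduction comes from.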
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09474">arXiv:2408.09474</a> <span> [<a href="https://arxiv.org/pdf/2408.09474">pdf</a>, <a href="https://arxiv.org/format/2408.09474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image-Based Geolocation Using Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Junchen Ding</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+G">Gelei Deng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuekang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Weisong Sun</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yaowen Zheng</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jingquan Ge</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Geolocation is now a vital aspect of modern life, offering numerous benefits but also presenting serious privacy concerns. The advent of large vision-language models (LVLMs) with advanced image-processing capabilities introduces new risks, as these models can inadvertently reveal sensitive geolocation information. This paper presents the first in-depth study analyzing the challenges posed by traditional deep learning and LVLM-based geolocation methods. Our findings reveal that LVLMs can accurately determine geolocations from images, even without explicit geographic training. To address these challenges, we introduce our tool, an innovative framework that significantly enhances image-based geolocation accuracy. It employs a systematic chain-of-thought (CoT) approach, mimicking human geoguessing strategies by carefully analyzing visual and contextual cues such as vehicle types, architectural styles, natural landscapes, and cultural elements. Extensive testing on a dataset of 50,000 ground-truth data points shows that our tool outperforms both traditional models and human benchmarks in accuracy. It achieves an impressive average score of 4550.5 in the GeoGuessr game, with an 85.37% win rate, and delivers highly precise geolocation predictions, with the closest distances as accurate as 0.3 km. Furthermore, our study highlights issues related to dataset integrity, leading to the creation of a more robust dataset and a refined framework that leverages LVLMs' cognitive capabilities to improve geolocation precision. These findings underscore the tool's superior ability to interpret complex visual data, the urgent need to address emerging security vulnerabilities posed by LVLMs, and the importance of responsible AI development to ensure user privacy protection. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
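The chain-of-thought strategy described here amounts to a structured prompt over visual cues. Below is a hedged sketch; the cue list and the `ask_lvlm` client are placeholders rather than the paper's actual framework:

```python
# Hypothetical CoT prompt construction for image-based geolocation.
CUES = ["vehicle types and license plates", "architectural style",
        "natural landscape and vegetation", "language on signs",
        "cultural elements (dress, storefronts)"]

def build_geolocation_prompt() -> str:
    steps = "\n".join(f"{i + 1}. Describe {cue} and what regions it suggests."
                      for i, cue in enumerate(CUES))
    return (
        "You are playing a geo-guessing game. Reason step by step:\n"
        f"{steps}\n"
        f"{len(CUES) + 1}. Combine the evidence and output your best guess as "
        "latitude, longitude, and a confidence level."
    )

def ask_lvlm(image_bytes: bytes, prompt: str) -> str:
    raise NotImplementedError("swap in a real vision-language model client")

print(build_geolocation_prompt())
```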
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07894">arXiv:2408.07894</a> <span> [<a href="https://arxiv.org/pdf/2408.07894">pdf</a>, <a href="https://arxiv.org/format/2408.07894">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> System States Forecasting of Microservices with Dynamic Spatio-Temporal Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yifei Xu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jingguo Ge</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haina Tang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+S">Shuai Ding</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tong Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hui Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In the AIOps (Artificial Intelligence for IT Operations) era, accurately forecasting system states is crucial. In microservices systems, this task encounters the challenge of dynamic and complex spatio-temporal relationships among microservice instances, primarily due to dynamic deployments, diverse call paths, and cascading effects among instances. Current time-series forecasting methods, which focus mainly on intrinsic patterns, are insufficient in environments where spatial relationships are critical. Similarly, spatio-temporal graph approaches often neglect the nature of temporal trends, concentrating mostly on message passing between nodes. Moreover, current research in the microservices domain frequently underestimates the importance of network metrics and topological structures in capturing the evolving dynamics of systems. This paper introduces STMformer, a model tailored for forecasting system states in microservices environments, capable of handling multi-node and multivariate time series. Our method leverages dynamic network connection data and topological information to assist in modeling the intricate spatio-temporal relationships within the system. Additionally, we integrate the PatchCrossAttention module to compute the impact of cascading effects globally.
We have developed a dataset based on a microservices system and conducted comprehensive experiments with STMformer against leading methods. In both short-term and long-term forecasting tasks, our model consistently achieved an 8.6% reduction in MAE (Mean Absolute Error) and a 2.2% reduction in MSE (Mean Squared Error). The source code is available at https://github.com/xuyifeiiie/STMformer. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07365">arXiv:2407.07365</a> <span> [<a href="https://arxiv.org/pdf/2407.07365">pdf</a>, <a href="https://arxiv.org/format/2407.07365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> High-Resolution Cloud Detection Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingsheng Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+T">Tianxiang Xue</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiayi Zhao</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jingmin Ge</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yufang Min</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wei Su</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+K">Kun Zhan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The complexity of clouds, particularly in terms of texture detail at high resolutions, has not been well explored by most existing cloud detection networks. This paper introduces the High-Resolution Cloud Detection Network (HR-cloud-Net), which utilizes a hierarchical high-resolution integration approach. HR-cloud-Net integrates a high-resolution representation module, a layer-wise cascaded feature fusion module, and a multi-resolution pyramid pooling module to effectively capture complex cloud features. This architecture preserves detailed cloud texture information while facilitating feature exchange across different resolutions, thereby enhancing overall performance in cloud detection. Additionally, a novel approach is introduced wherein a student view, trained on noisy augmented images, is supervised by a teacher view processing normal images. This setup enables the student to learn from the cleaner supervision provided by the teacher, leading to improved performance. Extensive evaluations on three optical satellite image cloud detection datasets validate the superior performance of HR-cloud-Net compared to existing methods. The source code is available at https://github.com/kunzhan/HR-cloud-Net. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Journal of Electronic Imaging</span> </p> </li>
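The student/teacher setup in this abstract corresponds to a standard consistency objective. Here is a minimal PyTorch sketch with illustrative shapes (stand-in convolutions, not HR-cloud-Net): the teacher sees the clean image, the student sees a noisy augmentation, and the student is trained to match the teacher's prediction:

```python
# Student on noisy view matches teacher on clean view (toy networks).
import torch
import torch.nn.functional as F

teacher = torch.nn.Conv2d(3, 2, 3, padding=1)  # stand-in cloud/no-cloud nets
student = torch.nn.Conv2d(3, 2, 3, padding=1)

img = torch.rand(4, 3, 64, 64)
noisy = img + 0.1 * torch.randn_like(img)      # noisy augmented view

with torch.no_grad():                          # teacher is not trained here
    target = teacher(img).softmax(dim=1)
loss = F.kl_div(student(noisy).log_softmax(dim=1), target, reduction="batchmean")
loss.backward()
```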
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05463">arXiv:2407.05463</a> <span> [<a href="https://arxiv.org/pdf/2407.05463">pdf</a>, <a href="https://arxiv.org/format/2407.05463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Training Task Experts through Retrieval Based Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiaxin Ge</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xueying Jia</a>, <a href="/search/cs?searchtype=author&query=Viswanathan%2C+V">Vijay Viswanathan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hongyin Luo</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> One of the most reliable ways to create deployable models for specialized tasks is to obtain an adequate amount of high-quality task-specific data. However, for specialized tasks, such datasets often do not exist. Existing methods address this by creating such data from large language models (LLMs) and then distilling the knowledge into smaller models. However, these methods are limited by the quality of the LLMs' output and tend to generate repetitive or incorrect data. In this work, we present Retrieval Based Distillation (ReBase), a method that first retrieves data from rich online sources and then transforms it into domain-specific data. This method greatly enhances data diversity. Moreover, ReBase generates Chain-of-Thought reasoning and distills the reasoning capacity of LLMs. We test our method on four benchmarks, and the results show that it significantly improves performance by up to 7.8% on SQuAD, 1.37% on MNLI, and 1.94% on BigBench-Hard. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li>
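The retrieve-then-transform recipe can be sketched end to end. The following hedged Python sketch uses a toy lexical-overlap retriever and a placeholder `call_llm`; real systems would use dense or BM25 retrieval and an actual LLM client:

```python
# Retrieve related examples from an existing source, then have an LLM
# rewrite them into the target task's format with reasoning attached.
CORPUS = [
    "The Treaty of Paris was signed in 1783, ending the Revolutionary War.",
    "Mitochondria generate most of the cell's supply of ATP.",
    "The Nile flows north through eleven countries into the Mediterranean.",
]

def retrieve(task_description: str, k: int = 2) -> list[str]:
    # Toy lexical-overlap scoring; a stand-in for real retrieval.
    score = lambda doc: len(set(task_description.lower().split())
                            & set(doc.lower().split()))
    return sorted(CORPUS, key=score, reverse=True)[:k]

def transform_prompt(doc: str, task_description: str) -> str:
    return (f"Source passage: {doc}\n"
            f"Task: {task_description}\n"
            "Write one training example for this task as JSON with fields "
            "'question', 'reasoning' (chain of thought), and 'answer'.")

def call_llm(prompt: str) -> str:
    raise NotImplementedError("swap in a real LLM client")

task = "reading comprehension questions about rivers"
for doc in retrieve(task):
    print(transform_prompt(doc, task))
```

Grounding generation in retrieved passages is what gives the diversity benefit the abstract claims over purely LLM-invented data.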
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03595">arXiv:2407.03595</a> <span> [<a href="https://arxiv.org/pdf/2407.03595">pdf</a>, <a href="https://arxiv.org/format/2407.03595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Machine Learning for Economic Forecasting: An Application to China's GDP Growth </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yanqing Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingcheng Xu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jinfeng Ge</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper explores the application of machine learning to forecasting Chinese macroeconomic variables. Specifically, it employs various machine learning models to predict the quarterly real GDP growth of China and analyzes the factors contributing to the performance differences among these models. Our findings indicate that the average forecast errors of machine learning models are generally lower than those of traditional econometric models or expert forecasts, particularly in periods of economic stability. However, during certain inflection points, although machine learning models still outperform traditional econometric models, expert forecasts may exhibit greater accuracy in some instances due to experts' more comprehensive understanding of the macroeconomic environment and real-time economic variables. In addition to macroeconomic forecasting, this paper employs interpretable machine learning methods to identify the key attributive variables from different machine learning models, aiming to enhance the understanding and evaluation of their contributions to macroeconomic fluctuations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li>
arXiv:2406.14887 (https://arxiv.org/abs/2406.14887) [cs.CL]
InternLM-Law: An Open Source Chinese Legal Large Language Model
Authors: Zhiwei Fei, Songyang Zhang, Xiaoyu Shen, Dawei Zhu, Xiao Wang, Maosong Cao, Fengzhe Zhou, Yining Li, Wenwei Zhang, Dahua Lin, Kai Chen, Jidong Ge
Abstract: While large language models (LLMs) have showcased impressive capabilities, they struggle with legal queries due to the intricate complexity and specialized expertise required in the legal field. In this paper, we introduce InternLM-Law, a specialized LLM tailored to diverse legal queries related to Chinese laws, spanning from standard legal questions (e.g., legal exercises in textbooks) to the analysis of complex real-world legal situations. We meticulously construct a dataset in the Chinese legal domain, encompassing over 1 million queries, and implement a data filtering and processing pipeline to ensure its diversity and quality. Our training approach involves a novel two-stage process: we initially fine-tune LLMs on both legal-specific and general-purpose content to equip the models with broad knowledge, followed by exclusive fine-tuning on high-quality legal data to enhance structured output generation. InternLM-Law achieves the highest average performance on LawBench, outperforming state-of-the-art models, including GPT-4, on 13 out of 20 subtasks. We make InternLM-Law and our dataset publicly available to facilitate future research in applying LLMs within the legal domain.
Submitted 21 June, 2024; originally announced June 2024.
Comments: Our dataset, code and models will be released at https://github.com/InternLM/InternLM-Law
arXiv:2406.04330 (https://arxiv.org/abs/2406.04330) [cs.CV]
Parameter-Inverted Image Pyramid Networks
Authors: Xizhou Zhu, Xue Yang, Zhaokai Wang, Hao Li, Wenhan Dou, Junqi Ge, Lewei Lu, Yu Qiao, Jifeng Dai
Abstract: Image pyramids are commonly used in modern computer vision tasks to obtain multi-scale features for precise image understanding. However, image pyramids process multiple resolutions of an image with the same large-scale model, which incurs significant computational cost. To overcome this issue, we propose a novel network architecture, the Parameter-Inverted Image Pyramid Networks (PIIP). Our core idea is to use models with different parameter sizes to process different resolution levels of the image pyramid, thereby balancing computational efficiency and performance. Specifically, the input to PIIP is a set of multi-scale images, where higher-resolution images are processed by smaller networks. We further propose a feature interaction mechanism that allows features of different resolutions to complement each other and effectively integrates information from different spatial scales. Extensive experiments demonstrate that PIIP achieves superior performance on tasks such as object detection, segmentation, and image classification compared to traditional image pyramid methods and single-branch networks, while reducing computational cost. Notably, when applying our method to the large-scale vision foundation model InternViT-6B, we improve its performance on detection and segmentation by 1%-2% with only 40%-60% of the original computation. These results validate the effectiveness of the PIIP approach and provide a new technical direction for future vision computing tasks. Our code and models are available at https://github.com/OpenGVLab/PIIP.
Submitted 28 October, 2024; v1 submitted 6 June, 2024; originally announced June 2024.
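A schematic PyTorch sketch of the parameter-inverted idea (not the official PIIP implementation; layer sizes and the one-convolution "interaction" are invented to keep the example small):

```python
# The low-resolution input goes through a wide, expensive branch; the
# high-resolution input through a much lighter one; the two feature maps are
# fused at a common spatial size.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ParameterInvertedPyramid(nn.Module):
    def __init__(self):
        super().__init__()
        self.heavy = nn.Conv2d(3, 256, 3, stride=2, padding=1)  # large model, low res
        self.light = nn.Conv2d(3, 32, 3, stride=2, padding=1)   # small model, high res
        self.mix = nn.Conv2d(256 + 32, 128, 1)                  # simple feature interaction

    def forward(self, img: torch.Tensor) -> torch.Tensor:
        low = F.interpolate(img, scale_factor=0.5, mode="bilinear", align_corners=False)
        f_heavy = self.heavy(low)                               # coarse but rich features
        f_light = self.light(img)                               # fine but cheap features
        f_heavy = F.interpolate(f_heavy, size=f_light.shape[-2:], mode="bilinear", align_corners=False)
        return self.mix(torch.cat([f_heavy, f_light], dim=1))

feats = ParameterInvertedPyramid()(torch.randn(1, 3, 224, 224))
print(feats.shape)  # torch.Size([1, 128, 112, 112])
```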
arXiv:2406.04201 (https://arxiv.org/abs/2406.04201) [cs.LG, cs.MA, math.OC, stat.ML]
Securing Equal Share: A Principled Approach for Learning Multiplayer Symmetric Games
Authors: Jiawei Ge, Yuanhao Wang, Wenzhe Li, Chi Jin
Abstract: This paper examines multiplayer symmetric constant-sum games with more than two players in a competitive setting, including examples like Mahjong, Poker, and various board and video games. In contrast to two-player zero-sum games, equilibria in multiplayer games are neither unique nor non-exploitable, failing to provide meaningful guarantees when competing against opponents who play different equilibria or non-equilibrium strategies. This gives rise to a series of long-standing fundamental questions in multiplayer games regarding suitable objectives, solution concepts, and principled algorithms. This paper takes an initial step towards addressing these challenges by focusing on the natural objective of equal share: securing an expected payoff of C/n in an n-player symmetric game with total payoff C. We rigorously identify the theoretical conditions under which achieving an equal share is tractable and design a series of efficient algorithms, inspired by no-regret learning, that provably attain an approximate equal share across various settings. Furthermore, we provide complementary lower bounds that justify the sharpness of our theoretical results. Our experimental results highlight worst-case scenarios where meta-algorithms from prior state-of-the-art systems for multiplayer games fail to secure an equal share, while our algorithm succeeds, demonstrating the effectiveness of our approach.
Submitted 2 October, 2024; v1 submitted 6 June, 2024; originally announced June 2024.
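The paper's algorithms build on no-regret learning; below is a generic multiplicative-weights (Hedge) learner of that kind, shown on an invented 3-player toy game. This is a textbook subroutine, not the paper's equal-share procedure:

```python
# Hedge: keep a weight per action, sample actions in proportion to the
# weights, and exponentially up-weight actions that would have paid well.
import math
import random

def hedge(n_actions: int, payoff_fn, rounds: int = 2000, eta: float = 0.05):
    weights = [1.0] * n_actions
    avg_payoff = 0.0
    for t in range(1, rounds + 1):
        total = sum(weights)
        probs = [w / total for w in weights]
        action = random.choices(range(n_actions), probs)[0]
        payoffs = payoff_fn()                 # payoff of every action this round
        avg_payoff += (payoffs[action] - avg_payoff) / t
        weights = [w * math.exp(eta * p) for w, p in zip(weights, payoffs)]
    return avg_payoff

# Toy 3-action game: two opponents pick uniformly; you earn 1 on a triple match.
def round_payoffs():
    a, b = random.randrange(3), random.randrange(3)
    return [1.0 if (a == b == x) else 0.0 for x in range(3)]

print(hedge(3, round_payoffs))   # approaches the best fixed action's payoff
```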
arXiv:2405.17418 (https://arxiv.org/abs/2405.17418) [cs.CV]
Self-Corrected Multimodal Large Language Model for End-to-End Robot Manipulation
Authors: Jiaming Liu, Chenxuan Li, Guanqun Wang, Lily Lee, Kaichen Zhou, Sixiang Chen, Chuyan Xiong, Jiaxin Ge, Renrui Zhang, Shanghang Zhang
Abstract: Robot manipulation policies have shown unsatisfactory performance when confronted with novel tasks or object instances. Hence, the capability to automatically detect and self-correct failed actions is essential for a practical robotic system. Recently, Multimodal Large Language Models (MLLMs) have shown promise in visual instruction following and demonstrated strong reasoning abilities across various tasks. To unleash general MLLMs as end-to-end robotic agents, we introduce the Self-Corrected (SC)-MLLM, equipping our model not only to predict end-effector poses but also to autonomously recognize and correct failed actions. Specifically, we first conduct parameter-efficient fine-tuning to empower the MLLM with pose prediction ability, reframed as a language modeling problem. When facing execution failures, our model learns to identify the causes of low-level action errors (i.e., position and rotation errors) and adaptively seeks prompt feedback from experts. Based on the feedback, SC-MLLM rethinks the current failure scene and generates corrected actions. Furthermore, we design a continuous policy learning method for successfully corrected samples, enhancing the model's adaptability to the current scene configuration and reducing the frequency of expert intervention. To evaluate SC-MLLM, we conduct extensive experiments in both simulation and real-world settings. The SC-MLLM agent significantly improves manipulation accuracy compared to the previous state-of-the-art robotic MLLM (ManipLLM), from 57% to 79% on seen object categories and from 47% to 69% on unseen novel categories.
Submitted 27 May, 2024; originally announced May 2024.
arXiv:2405.10302 (https://arxiv.org/abs/2405.10302) [stat.ME, cs.LG, math.ST, stat.ML]
Optimal Aggregation of Prediction Intervals under Unsupervised Domain Shift
Authors: Jiawei Ge, Debarghya Mukherjee, Jianqing Fan
Abstract: As machine learning models are increasingly deployed in dynamic environments, it becomes paramount to assess and quantify the uncertainties associated with distribution shifts. A distribution shift occurs when the underlying data-generating process changes, leading to a deviation in the model's performance. The prediction interval, which captures the range of likely outcomes for a given prediction, serves as a crucial tool for characterizing the uncertainties induced by the underlying distribution. In this paper, we propose methodologies for aggregating prediction intervals to obtain one with minimal width and adequate coverage on the target domain under unsupervised domain shift, in which we have labeled samples from a related source domain and unlabeled covariates from the target domain. Our analysis encompasses scenarios where the source and target domains are related via (i) a bounded density ratio and (ii) a measure-preserving transformation. Our proposed methodologies are computationally efficient and easy to implement. Beyond illustrating the performance of our method on real-world datasets, we also delve into the theoretical details, establishing rigorous guarantees, together with finite-sample bounds, on the coverage and width of our prediction intervals. Our approach excels in practical applications and is underpinned by a solid theoretical framework, ensuring its reliability and effectiveness across diverse contexts.
Submitted 7 October, 2024; v1 submitted 16 May, 2024; originally announced May 2024.
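A simplified sketch of the bounded-density-ratio setting (this only conveys the reweighting step, not the paper's aggregation of several candidate intervals; the Gaussian shift and known density ratio are assumptions of the example):

```python
# Reweight labeled source residuals by the density ratio
# w(x) = p_target(x) / p_source(x) and take a weighted quantile to calibrate
# an interval whose coverage is targeted at the shifted domain.
import numpy as np

def weighted_quantile(values, weights, q):
    order = np.argsort(values)
    v, w = np.asarray(values)[order], np.asarray(weights)[order]
    cum = np.cumsum(w) / np.sum(w)
    return v[np.searchsorted(cum, q)]

rng = np.random.default_rng(1)
x_src = rng.normal(0.0, 1.0, 2000)                  # source covariates ~ N(0, 1)
y_src = 2 * x_src + rng.normal(0.0, 0.5, 2000)      # labels on the source
predict = lambda x: 2 * x                           # some point predictor
residuals = np.abs(y_src - predict(x_src))

# Target covariates ~ N(1, 1); density ratio N(1,1)/N(0,1) = exp(x - 1/2).
w = np.exp(x_src - 0.5)
half_width = weighted_quantile(residuals, w, 0.9)   # aim at 90% target coverage
print(f"interval: prediction +/- {half_width:.2f}")
```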
arXiv:2405.04966 (https://arxiv.org/abs/2405.04966) [cs.IT, cs.CV, cs.MA]
Communication-Efficient Collaborative Perception via Information Filling with Codebook
Authors: Yue Hu, Juntong Peng, Sifei Liu, Junhao Ge, Si Liu, Siheng Chen
Abstract: Collaborative perception empowers each agent to improve its perceptual ability through the exchange of perceptual messages with other agents. It inherently results in a fundamental trade-off between perception ability and communication cost. To address this bottleneck, our core idea is to optimize the collaborative messages from two key aspects: representation and selection. The proposed codebook-based message representation enables the transmission of integer codes rather than high-dimensional feature maps. The proposed information-filling-driven message selection optimizes local messages to collectively fill each agent's information demand, preventing information overflow among multiple agents. By integrating these two designs, we propose CodeFilling, a novel communication-efficient collaborative perception system, which significantly advances the perception-communication trade-off and is inclusive to both homogeneous and heterogeneous collaboration settings. We evaluate CodeFilling on both a real-world dataset, DAIR-V2X, and a new simulation dataset, OPV2VH+. Results show that CodeFilling outperforms the previous SOTA Where2comm on DAIR-V2X/OPV2VH+ with 1,333/1,206 times lower communication volume. Our code is available at https://github.com/PhyllisH/CodeFilling.
Submitted 8 May, 2024; originally announced May 2024.
Comments: 10 pages, Accepted by CVPR 2024
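A minimal vector-quantization sketch of the codebook idea (not the CodeFilling system itself; the codebook size and feature dimensions are arbitrary):

```python
# Each feature vector is replaced by the index of its nearest codeword, so
# agents exchange small integer codes instead of high-dimensional float maps.
import numpy as np

rng = np.random.default_rng(0)
codebook = rng.normal(size=(64, 128))          # 64 shared codewords, dim 128
features = rng.normal(size=(1000, 128))        # one agent's local features

# Sender: transmit nearest-codeword indices (1000 small ints, not 1000x128 floats).
dists = ((features[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
codes = dists.argmin(axis=1)

# Receiver: reconstruct approximate features from the shared codebook.
recon = codebook[codes]
print(codes.nbytes, "bytes sent vs", features.astype(np.float32).nbytes, "bytes raw")
```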
arXiv:2405.00696 (https://arxiv.org/abs/2405.00696) [cs.RO]
Life-long Learning and Testing for Automated Vehicles via Adaptive Scenario Sampling as A Continuous Optimization Process
Authors: Jingwei Ge, Pengbo Wang, Cheng Chang, Yi Zhang, Danya Yao, Li Li
Abstract: Sampling critical testing scenarios is an essential step in intelligence testing for Automated Vehicles (AVs). However, due to the lack of prior knowledge about the distribution of critical scenarios in the sampling space, we can hardly find critical scenarios efficiently or evaluate the intelligence of AVs accurately. To solve this problem, we formulate testing as a continuous optimization process that iteratively generates potential critical scenarios and evaluates them. A bi-level loop is proposed for such life-long learning and testing. In the outer loop, we iteratively learn space knowledge by evaluating the AV in the already sampled scenarios and then sample new scenarios based on the retained knowledge; the outer loop stops when the generated samples cover the whole space. To maximize the coverage of the space in each outer iteration, we set an inner loop that receives the newly generated samples and outputs their updated positions. We assume that the points in a small sphere-like subspace can be covered (or represented) by the point at the center of this sphere; therefore, we can apply a multi-round heuristic strategy to move and pack these spheres in space to find the best covering solution. The simulation results show that faster and more accurate evaluation of AVs can be achieved with more critical scenarios.
Submitted 28 March, 2024; originally announced May 2024.
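A compact stand-in for the covering step (greedy farthest-point selection, a classic k-center heuristic, not the paper's move-and-pack strategy; the 2-D scenario space and radius are invented):

```python
# Place scenario "sphere centers" until every sampled scenario lies within
# radius r of some center, i.e., the sampled space is covered.
import numpy as np

def cover(points: np.ndarray, r: float) -> np.ndarray:
    centers = [points[0]]
    dist = np.linalg.norm(points - points[0], axis=1)
    while dist.max() > r:
        c = points[dist.argmax()]              # farthest uncovered scenario
        centers.append(c)
        dist = np.minimum(dist, np.linalg.norm(points - c, axis=1))
    return np.array(centers)

scenarios = np.random.default_rng(2).uniform(size=(500, 2))   # toy 2-D scenario space
print(len(cover(scenarios, r=0.15)), "centers cover all 500 scenarios")
```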
arXiv:2404.16611 (https://arxiv.org/abs/2404.16611) [cs.IT, eess.SP]
Towards Symbiotic SAGIN Through Inter-operator Resource and Service Sharing: Joint Orchestration of User Association and Radio Resources
Authors: Shizhao He, Jungang Ge, Ying-Chang Liang, Dusit Niyato
Abstract: The space-air-ground integrated network (SAGIN) is a pivotal architecture for supporting ubiquitous connectivity in the upcoming 6G era. Inter-operator resource and service sharing is a promising way to realize such a huge network, utilizing resources efficiently and reducing construction costs. Given the rationality of operators, the configuration of resources and services in SAGIN should account for both overall system performance and the individual benefits of operators. Motivated by emerging symbiotic communication, which facilitates mutual benefits across different radio systems, we investigate resource and service sharing in SAGIN from a symbiotic communication perspective. In particular, we consider a SAGIN consisting of a ground network operator (GNO) and a satellite network operator (SNO). Specifically, we aim to maximize the weighted sum rate (WSR) of the whole SAGIN by jointly optimizing the user association, resource allocation, and beamforming. Besides, we introduce a sharing coefficient to characterize the revenue of the operators, who may suffer revenue losses when the design focuses only on maximizing the WSR. In pursuit of mutual benefits, we propose a mutual benefit constraint (MBC) to ensure that each operator obtains revenue gains. We then develop a centralized algorithm based on the successive convex approximation (SCA) method. Considering that the centralized algorithm is difficult to implement, we also propose a distributed algorithm based on Lagrangian dual decomposition and the consensus alternating direction method of multipliers (ADMM). Finally, we provide extensive numerical simulations to demonstrate the effectiveness of the two proposed algorithms; the distributed algorithm approaches the performance of the centralized one.
Submitted 25 April, 2024; originally announced April 2024.
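For context, the objective being optimized has a simple generic form: WSR = sum_k w_k * log2(1 + SINR_k). The sketch below merely evaluates it for given powers and channel gains (all numbers invented); the paper's contribution is the joint SCA/ADMM optimization of association, resources, and beams:

```python
# Evaluate a weighted sum rate under a simple interference model.
import math

def weighted_sum_rate(weights, powers, gains, noise=1.0):
    # SINR of user k: own received power over interference from others + noise.
    rx = [p * g for p, g in zip(powers, gains)]
    return sum(w * math.log2(1 + rx[k] / (sum(rx) - rx[k] + noise))
               for k, w in enumerate(weights))

print(weighted_sum_rate(weights=[1.0, 0.5, 0.5], powers=[2.0, 1.0, 1.5], gains=[0.9, 0.7, 0.4]))
```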
arXiv:2404.09496 (https://arxiv.org/abs/2404.09496) [cs.CV]
Towards Collaborative Autonomous Driving: Simulation Platform and End-to-End System
Authors: Genjia Liu, Yue Hu, Chenxin Xu, Weibo Mao, Junhao Ge, Zhengxiang Huang, Yifan Lu, Yinda Xu, Junkai Xia, Yafei Wang, Siheng Chen
Abstract: Vehicle-to-everything-aided autonomous driving (V2X-AD) has huge potential to provide a safer driving solution. Despite extensive research in transportation and communication to support V2X-AD, how these infrastructures and communication resources can actually be used to enhance driving performance remains largely unexplored. This highlights the necessity of collaborative autonomous driving: a machine learning approach that optimizes the information-sharing strategy to improve the driving performance of each vehicle. This effort requires two key foundations: a platform capable of generating data to facilitate the training and testing of V2X-AD, and a comprehensive system that integrates full driving-related functionalities with mechanisms for information sharing. From the platform perspective, we present V2Xverse, a comprehensive simulation platform for collaborative autonomous driving that provides a complete pipeline for collaborative driving. From the system perspective, we introduce CoDriving, a novel end-to-end collaborative driving system that properly integrates V2X communication over the entire autonomous pipeline, promoting driving with shared perceptual information. The core idea is a novel driving-oriented communication strategy; leveraging it, CoDriving improves driving performance while optimizing communication efficiency. We make comprehensive benchmarks with V2Xverse, analyzing both modular performance and closed-loop driving performance. Experimental results show that CoDriving (i) significantly improves the driving score by 62.49% and drastically reduces the pedestrian collision rate by 53.50% compared to the SOTA end-to-end driving method, and (ii) sustains its driving performance advantage under dynamically constrained communication conditions.
Submitted 15 April, 2024; originally announced April 2024.
arXiv:2404.06201 (https://arxiv.org/abs/2404.06201) [cs.SE, cs.AI]
Open-Source AI-based SE Tools: Opportunities and Challenges of Collaborative Software Learning
Authors: Zhihao Lin, Wei Ma, Tao Lin, Yaowen Zheng, Jingquan Ge, Jun Wang, Jacques Klein, Tegawende Bissyande, Yang Liu, Li Li
Abstract: Large Language Models (LLMs) have become instrumental in advancing software engineering (SE) tasks, showcasing their efficacy in code understanding and beyond. As with traditional SE tools, open-source collaboration is key to building excellent products; with AI models, however, the essential resource is data. The collaboration of AI-based SE models hinges on maximizing the sources of high-quality data. Yet data, especially high-quality data, often holds commercial or sensitive value, making it less accessible to open-source AI-based SE projects. This reality presents a significant barrier to the development and enhancement of AI-based SE tools within the software engineering community. Therefore, researchers need solutions that enable open-source AI-based SE models to tap into resources held by different organizations. Addressing this challenge, our position paper investigates one such solution: facilitating access to diverse organizational resources for open-source AI models while ensuring that privacy and commercial sensitivities are respected. We introduce a governance framework centered on federated learning (FL), designed to foster the joint development and maintenance of open-source AI code models while safeguarding data privacy and security. Additionally, we present guidelines for developers on AI-based SE tool collaboration, covering data requirements, model architecture, updating strategies, and version control. Given the significant influence of data characteristics on FL, our research also examines the effect of code-data heterogeneity on FL performance.
Submitted 9 April, 2024; originally announced April 2024.
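A minimal federated-averaging (FedAvg) round of the kind such a governance framework assumes (a pure-numpy toy on a least-squares task, not a production FL stack; all data and names are synthetic):

```python
# Each organization trains on private data locally and shares only model
# weights, which the coordinator averages each communication round.
import numpy as np

def local_step(w, X, y, lr=0.1):
    grad = 2 * X.T @ (X @ w - y) / len(y)      # least-squares gradient on private data
    return w - lr * grad

rng = np.random.default_rng(3)
w_true = np.array([1.0, -2.0])
orgs = []
for _ in range(4):                             # four organizations, private datasets
    X = rng.normal(size=(50, 2))
    orgs.append((X, X @ w_true + 0.1 * rng.normal(size=50)))

w_global = np.zeros(2)
for _ in range(100):                           # communication rounds
    w_global = np.mean([local_step(w_global, X, y) for X, y in orgs], axis=0)
print(w_global)                                # approaches w_true without pooling data
```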
arXiv:2404.05667 (https://arxiv.org/abs/2404.05667) [cs.CV]
DOI: https://doi.org/10.1007/978-3-031-72775-7_9
AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic Segmentation
Authors: Jiannan Ge, Lingxi Xie, Hongtao Xie, Pandeng Li, Xiaopeng Zhang, Yongdong Zhang, Qi Tian
Abstract: A serious issue that harms the performance of zero-shot visual recognition is objective misalignment: the learning objective prioritizes improving the recognition accuracy of seen classes rather than unseen classes, while the latter is the true target to pursue. This issue becomes more significant in zero-shot image segmentation because the stronger (i.e., pixel-level) supervision brings a larger gap between seen and unseen classes. To mitigate it, we propose a novel architecture named AlignZeg, which embodies a comprehensive improvement of the segmentation pipeline, including proposal extraction, classification, and correction, to better fit the goal of zero-shot segmentation. (1) Mutually-Refined Proposal Extraction: AlignZeg harnesses a mutual interaction between mask queries and visual features, facilitating detailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced Proposal Classification: AlignZeg introduces synthetic data and incorporates multiple background prototypes to allocate a more generalizable feature space. (3) Predictive Bias Correction: during inference, AlignZeg uses a class indicator to find potential unseen-class proposals, followed by a prediction postprocess that corrects the prediction bias. Experiments demonstrate that AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an average 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in identifying unseen classes, and we further validate that the improvement comes from alleviating the objective misalignment issue.
Submitted 8 April, 2024; originally announced April 2024.
Journal ref: ECCV 2024
arXiv:2404.01554 (https://arxiv.org/abs/2404.01554) [cs.SE]
DOI: https://doi.org/10.1145/3650212.3652130
FT2Ra: A Fine-Tuning-Inspired Approach to Retrieval-Augmented Code Completion
Authors: Qi Guo, Xiaohong Li, Xiaofei Xie, Shangqing Liu, Ze Tang, Ruitao Feng, Junjie Wang, Jidong Ge, Lei Bu
Abstract: The rise of code pre-trained models has significantly enhanced various coding tasks, such as code completion, and tools like GitHub Copilot. However, the substantial size of these models, especially large models, poses a significant challenge when it comes to fine-tuning them for specific downstream tasks. As an alternative, retrieval-based methods have emerged as a promising solution, augmenting model predictions without the need for fine-tuning. Despite their potential, a significant challenge is that the designs of these methods often rely on heuristics, leaving open critical questions about what information should be stored or retrieved and how that information should be interpolated to augment predictions. To tackle this challenge, we first perform a theoretical analysis of the fine-tuning process, highlighting the importance of delta logits as a catalyst for improving model predictions. Building on this insight, we develop a novel retrieval-based method, FT2Ra, which aims to mimic genuine fine-tuning: while FT2Ra adopts a retrieval-based mechanism, it uniquely employs a paradigm with a learning rate and multi-epoch retrievals, similar to fine-tuning. On token-level completion, a relatively easier task, FT2Ra achieves a 4.29% improvement in accuracy over the best baseline method on UniXcoder. On the more challenging line-level completion task, we observe a more than twofold increase in Exact Match (EM) performance, indicating the significant advantage of our theoretical analysis. Notably, even when operating without actual fine-tuning, FT2Ra exhibits competitive performance compared to models with real fine-tuning.
Submitted 1 April, 2024; originally announced April 2024.
Comments: ISSTA 2024
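An abstract sketch of the delta-logits idea as the abstract describes it (not the released FT2Ra code; the function name, shapes, and update rule are illustrative assumptions):

```python
# Retrieved neighbors contribute a correction to the model's logits, applied
# with a "learning rate" over several retrieval "epochs", mimicking what
# fine-tuning would do to the predictions.
import numpy as np

def ft2ra_like(logits, neighbor_deltas, lr=0.3, epochs=3):
    # neighbor_deltas: (target - predicted) logit gaps gathered from a
    # datastore of similar contexts; averaged and repeatedly applied.
    for _ in range(epochs):
        logits = logits + lr * neighbor_deltas.mean(axis=0)
    return logits

base = np.array([2.0, 1.5, 0.1])                    # model logits for 3 tokens
deltas = np.array([[0.2, 1.0, -0.3], [0.0, 0.8, -0.1]])
print(ft2ra_like(base, deltas).argmax())            # retrieval flips the prediction to token 1
```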
arXiv:2403.17297 (https://arxiv.org/abs/2403.17297) [cs.CL, cs.AI]
InternLM2 Technical Report
Authors: Zheng Cai, Maosong Cao, Haojiong Chen, Kai Chen, Keyu Chen, Xin Chen, Xun Chen, Zehui Chen, Zhi Chen, Pei Chu, Xiaoyi Dong, Haodong Duan, Qi Fan, Zhaoye Fei, Yang Gao, Jiaye Ge, Chenya Gu, Yuzhe Gu, Tao Gui, Aijia Guo, Qipeng Guo, Conghui He, Yingfan Hu, Ting Huang, Tao Jiang, et al. (75 additional authors not shown)
Abstract: The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations, through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in the pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models at different training stages and model sizes, we provide the community with insights into the model's evolution.
Submitted 25 March, 2024; originally announced March 2024.
arXiv:2403.15588 (https://arxiv.org/abs/2403.15588) [cs.IT, eess.SP]
RIS-assisted Cell-Free Massive MIMO Systems With Two-Timescale Design and Hardware Impairments
Authors: Jianxin Dai, Jin Ge, Kangda Zhi, Cunhua Pan, Youguo Wang
Abstract: Integrating reconfigurable intelligent surfaces (RISs) into a cell-free massive multiple-input multiple-output (CF-mMIMO) system is an effective way to achieve high system capacity with low cost and power consumption. However, existing works on RIS-assisted systems have mostly assumed perfect hardware, while the impact of hardware impairments (HWIs) is generally ignored. In this paper, we consider the general Rician fading channel and the uplink of an RIS-assisted CF-mMIMO system under transceiver impairments and RIS phase noise. To reduce feedback overhead and power consumption, we propose a two-timescale transmission scheme that optimizes the passive beamformers at the RISs with statistical channel state information (CSI), while the transmit beamformers at the access points (APs) are designed based on instantaneous CSI. Maximum ratio combining (MRC) detection is applied at the central processing unit (CPU). On this basis, we derive a closed-form approximate expression for the achievable rate, from which the impact of HWIs and the power scaling laws are analyzed to draw useful theoretical insights. To maximize the users' sum rate or minimum rate, we first transform our rate expression into a tractable form and then optimize the phase shifts of the RISs with an accelerated gradient ascent method. Finally, numerical results are presented to demonstrate the correctness of our derived expressions and to validate the preceding analysis, providing guidelines for the practical application of imperfect RISs in CF-mMIMO systems with transceiver HWIs.
Submitted 26 March, 2024; v1 submitted 22 March, 2024; originally announced March 2024.
Comments: 51 pages, 11 figures
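A bare-bones version of the phase-shift optimization step (the paper ascends on a closed-form rate with acceleration; here, plain numerical gradient ascent on a toy single-user cascaded channel, with all channels randomly generated for illustration):

```python
# Maximize log2(1 + |sum_n h_n * exp(j*theta_n) * g_n|^2) over RIS phases.
import numpy as np

rng = np.random.default_rng(4)
N = 16                                             # RIS elements
h = rng.normal(size=N) + 1j * rng.normal(size=N)   # AP-RIS channel
g = rng.normal(size=N) + 1j * rng.normal(size=N)   # RIS-user channel

def rate(theta):
    return np.log2(1 + np.abs(np.sum(h * np.exp(1j * theta) * g)) ** 2)

theta = np.zeros(N)
for _ in range(200):                               # ascend on the phases
    grad = np.array([(rate(theta + 1e-6 * e) - rate(theta)) / 1e-6
                     for e in np.eye(N)])
    theta += 0.05 * grad
print(f"optimized rate: {rate(theta):.2f} bits/s/Hz")
# The known optimum co-phases every element: theta_n = -angle(h_n * g_n).
print(f"closed-form optimum: {rate(-np.angle(h * g)):.2f} bits/s/Hz")
```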
arXiv:2402.16026 (https://arxiv.org/abs/2402.16026) [cs.LG]
Feature Selection Based on Orthogonal Constraints and Polygon Area
Authors: Zhenxing Zhang, Jun Ge, Zheng Wei, Chunjie Zhou, Yilei Wang
Abstract: The goal of feature selection is to choose the optimal subset of features for a recognition task by evaluating the importance of each feature, thereby achieving effective dimensionality reduction. Currently proposed feature selection methods often overlook the discriminative dependencies between features and labels. To address this problem, this paper introduces a novel orthogonal regression model that incorporates the area of a polygon; the model can intuitively capture the discriminative dependencies between features and labels. Additionally, this paper employs a hybrid non-monotone line search method to efficiently tackle the non-convex optimization challenge posed by the orthogonal constraints. Experimental results demonstrate that our approach not only effectively captures discriminative dependency information but also surpasses traditional methods in reducing feature dimensions and enhancing classification performance.
Submitted 25 February, 2024; originally announced February 2024.
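The non-convex difficulty comes from the constraint W^T W = I. A common way to handle it, shown below as a stand-in for the paper's hybrid non-monotone line search (the objective, data, and step size are invented), is a gradient step followed by a QR retraction back onto the orthogonal manifold:

```python
# Gradient descent with QR retraction for a least-squares objective under
# the orthogonality constraint W^T W = I.
import numpy as np

rng = np.random.default_rng(5)
X, Y = rng.normal(size=(100, 10)), rng.normal(size=(100, 3))

W = np.linalg.qr(rng.normal(size=(10, 3)))[0]   # feasible start: W^T W = I
for _ in range(200):
    grad = 2 * X.T @ (X @ W - Y) / 100
    Q, R = np.linalg.qr(W - 0.05 * grad)        # step, then retract to the manifold
    W = Q * np.sign(np.diag(R))                 # fix QR's column-sign ambiguity
print(np.allclose(W.T @ W, np.eye(3)))          # constraint holds: True
```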
arXiv:2402.07720 (https://arxiv.org/abs/2402.07720) [cs.SE]
DOI: https://doi.org/10.1109/TIV.2024.3400323
VistaScenario: Interaction Scenario Engineering for Vehicles with Intelligent Systems for Transport Automation
Authors: Cheng Chang, Jiawei Zhang, Jingwei Ge, Zuo Zhang, Junqing Wei, Li Li, Fei-Yue Wang
Abstract: Intelligent vehicles and autonomous driving systems rely on scenario engineering for intelligence and index (I&I), calibration and certification (C&C), and verification and validation (V&V).
To tackle this challenge, we first perform a theoretical analysis of the fine-tuning process, highlighting the importance of delta logits as a catalyst for improving model predictions. Building on this insight, we develop a novel retrieval-based method, FT2Ra, which aims to mimic genuine fine-tuning. While FT2Ra adopts a retrieval-based mechanism, it uniquely adopts a paradigm with a learning rate and multi-epoch retrievals, which is similar to fine-tuning.In token-level completion, which represents a relatively easier task, FT2Ra achieves a 4.29% improvement in accuracy compared to the best baseline method on UniXcoder. In the more challenging line-level completion task, we observe a substantial more than twice increase in Exact Match (EM) performance, indicating the significant advantages of our theoretical analysis. Notably, even when operating without actual fine-tuning, FT2Ra exhibits competitive performance compared to the models with real fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01554v1-abstract-full').style.display = 'none'; document.getElementById('2404.01554v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISSTA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17297">arXiv:2403.17297</a> <span> [<a href="https://arxiv.org/pdf/2403.17297">pdf</a>, <a href="https://arxiv.org/format/2403.17297">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> InternLM2 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zheng Cai</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+M">Maosong Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haojiong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Keyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zehui Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+P">Pei Chu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaoyi Dong</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+H">Haodong Duan</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Q">Qi Fan</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zhaoye Fei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiaye Ge</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenya Gu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuzhe Gu</a>, <a 
href="/search/cs?searchtype=author&query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+A">Aijia Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yingfan Hu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Ting Huang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+T">Tao Jiang</a> , et al. (75 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17297v1-abstract-short" style="display: inline;"> The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17297v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17297v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17297v1-abstract-full" style="display: none;"> The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k ``Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17297v1-abstract-full').style.display = 'none'; document.getElementById('2403.17297v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
arXiv:2403.15588 (https://arxiv.org/abs/2403.15588) [cs.IT, eess.SP]
Title: RIS-assisted Cell-Free Massive MIMO Systems With Two-Timescale Design and Hardware Impairments
Authors: Jianxin Dai, Jin Ge, Kangda Zhi, Cunhua Pan, Youguo Wang
Abstract: Integrating the reconfigurable intelligent surface (RIS) into a cell-free massive multiple-input multiple-output (CF-mMIMO) system is an effective solution to achieve high system capacity with low cost and power consumption. However, existing works on RIS-assisted systems mostly assume perfect hardware, while the impact of hardware impairments (HWIs) is generally ignored. In this paper, we consider the general Rician fading channel and uplink transmission of the RIS-assisted CF-mMIMO system under transceiver impairments and RIS phase noise. To reduce the feedback overhead and power consumption, we propose a two-timescale transmission scheme that optimizes the passive beamformers at the RISs with statistical channel state information (CSI), while the transmit beamformers at the access points (APs) are designed based on instantaneous CSI. Maximum ratio combining (MRC) detection is applied at the central processing unit (CPU). On this basis, we derive a closed-form approximate expression for the achievable rate, from which the impact of HWIs and the power scaling laws are analyzed to draw useful theoretical insights. To maximize the users' sum rate or minimum rate, we first transform our rate expression into a tractable form and then optimize the phase shifts of the RISs with an accelerated gradient ascent method. Finally, numerical results are presented to verify the correctness of our derived expressions and validate the preceding analysis, providing guidelines for the practical application of imperfect RISs in CF-mMIMO systems with transceiver HWIs.
Submitted 26 March, 2024; v1 submitted 22 March, 2024; originally announced March 2024.
Comments: 51 pages, 11 figures
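The phase-shift optimization step lends itself to a small illustration. Below is a minimal sketch of Nesterov-style accelerated gradient ascent over RIS phase shifts; the paper's closed-form achievable-rate expression is not reproduced, so a toy coherent-combining surrogate stands in for the objective (an assumption).

```python
# Accelerated gradient ascent over RIS phase angles theta (the unit-modulus
# constraint is automatic when optimizing angles). f is a toy surrogate
# objective, NOT the paper's derived rate expression.
import numpy as np

def num_grad(f, theta, eps=1e-6):
    g = np.zeros_like(theta)
    for i in range(theta.size):                  # central finite differences
        e = np.zeros_like(theta); e[i] = eps
        g[i] = (f(theta + e) - f(theta - e)) / (2 * eps)
    return g

def ascend(f, theta, steps=300, lr=0.05, beta=0.9):
    v = np.zeros_like(theta)                     # momentum buffer
    for _ in range(steps):
        g = num_grad(f, theta + beta * v)        # gradient at the look-ahead point
        v = beta * v + lr * g
        theta = theta + v
    return theta

rng = np.random.default_rng(0)
h = rng.normal(size=64) + 1j * rng.normal(size=64)   # toy AP-to-RIS channel
w = rng.normal(size=64) + 1j * rng.normal(size=64)   # toy RIS-to-user channel
f = lambda th: np.log2(1 + abs(np.sum(h * np.exp(1j * th) * w)) ** 2)
theta_star = ascend(f, np.zeros(64))                 # phases aligning the cascade
```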
arXiv:2402.16026 (https://arxiv.org/abs/2402.16026) [cs.LG]
Title: Feature Selection Based on Orthogonal Constraints and Polygon Area
Authors: Zhenxing Zhang, Jun Ge, Zheng Wei, Chunjie Zhou, Yilei Wang
Abstract: The goal of feature selection is to choose the optimal subset of features for a recognition task by evaluating the importance of each feature, thereby achieving effective dimensionality reduction. Currently proposed feature selection methods often overlook the discriminative dependencies between features and labels. To address this problem, this paper introduces a novel orthogonal regression model incorporating the area of a polygon. The model can intuitively capture the discriminative dependencies between features and labels. Additionally, we employ a hybrid non-monotone line search method to efficiently tackle the non-convex optimization challenge posed by the orthogonal constraints. Experimental results demonstrate that our approach not only effectively captures discriminative dependency information but also surpasses traditional methods in reducing feature dimensions and enhancing classification performance.
Submitted 25 February, 2024; originally announced February 2024.
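The orthogonality-constrained update can be illustrated with a QR retraction on the Stiefel manifold; the paper's polygon-area objective and hybrid non-monotone line search are not reproduced, so the gradient below is a placeholder.

```python
# One gradient step under the orthogonality constraint W^T W = I, using the
# tangent-space projection and a QR retraction. grad_f is a placeholder for
# the paper's polygon-area objective (an assumption).
import numpy as np

def stiefel_step(W, grad_f, lr=0.1):
    G = grad_f(W)
    A = W.T @ G
    tangent = G - W @ (A + A.T) / 2           # project onto the tangent space at W
    Q, R = np.linalg.qr(W - lr * tangent)     # retract back onto the manifold
    return Q * np.sign(np.diag(R))            # fix the QR sign ambiguity

W = np.linalg.qr(np.random.randn(20, 5))[0]   # random orthonormal start
W = stiefel_step(W, lambda M: np.random.randn(*M.shape))
print(np.allclose(W.T @ W, np.eye(5), atol=1e-8))  # orthogonality preserved
```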
arXiv:2402.07720 (https://arxiv.org/abs/2402.07720) [cs.SE]
DOI: 10.1109/TIV.2024.3400323
Title: VistaScenario: Interaction Scenario Engineering for Vehicles with Intelligent Systems for Transport Automation
Authors: Cheng Chang, Jiawei Zhang, Jingwei Ge, Zuo Zhang, Junqing Wei, Li Li, Fei-Yue Wang
Abstract: Intelligent vehicles and autonomous driving systems rely on scenario engineering for intelligence and index (I&I), calibration and certification (C&C), and verification and validation (V&V). To extract and index scenarios, various vehicle interactions deserve close attention and refined descriptions and labels. However, existing methods cannot cope well with scenario classification and labeling when vehicle interactions are the core concern. In this paper, we propose the VistaScenario framework to conduct interaction scenario engineering for vehicles with intelligent systems for transport automation. Based on the summarized basic types of vehicle interactions, we slice a scenario data stream into a series of segments via a spatiotemporal scenario evolution tree. We also propose the scenario metric Graph-DTW, based on the Graph Computation Tree and Dynamic Time Warping, to conduct refined scenario comparison and labeling. Extreme interaction scenarios and corner cases can thus be efficiently filtered and extracted. Moreover, testing examples on a trajectory prediction model with naturalistic scenario datasets demonstrate the effectiveness and advantages of our framework. VistaScenario can provide solid support for the usage and indexing of scenario data, further promoting the development of intelligent vehicles and transport automation.
Submitted 13 May, 2024; v1 submitted 12 February, 2024; originally announced February 2024.
Comments: Accepted by IEEE Transactions on Intelligent Vehicles
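For reference, the Dynamic Time Warping half of Graph-DTW admits a compact implementation with a pluggable per-step cost; the paper's Graph Computation Tree distance is replaced here by a placeholder Euclidean cost (an assumption).

```python
# Classic DTW between two sequences with a pluggable pairwise cost; the cost
# below is a Euclidean stand-in for the paper's graph-based distance.
import numpy as np

def dtw(a, b, cost):
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # best of insertion, deletion, and match moves
            D[i, j] = cost(a[i - 1], b[j - 1]) + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]

# toy usage: two scenario segments summarized as per-frame feature vectors
seg1, seg2 = np.random.randn(30, 4), np.random.randn(25, 4)
print(dtw(seg1, seg2, cost=lambda x, y: np.linalg.norm(x - y)))
```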
arXiv:2402.03760 (https://arxiv.org/abs/2402.03760) [cs.NI]
Title: DeMarking: A Defense for Network Flow Watermarking in Real-Time
Authors: Yali Yuan, Jian Ge, Guang Cheng
Abstract: The network flow watermarking technique associates the two communicating parties by actively modifying certain characteristics of the stream generated by the sender so that it covertly carries special marking information. Curious users communicating with a hidden server as a Tor client may attempt de-anonymization attacks that use this technique to uncover the real identity of the hidden server, compromising the privacy of the anonymized communication system. Therefore, we propose a defense scheme against flow watermarking. The scheme is based on deep neural networks and utilizes generative adversarial networks to convert the original Inter-Packet Delays (IPDs) into new IPDs generated by the model. We also adopt the concept of adversarial attacks to ensure that the detector will produce an incorrect classification when detecting these new IPDs. This approach ensures that the new IPDs are considered "clean", effectively covering the potential watermarks. The scheme is effective against time-based flow watermarking techniques.
Submitted 6 February, 2024; v1 submitted 6 February, 2024; originally announced February 2024.
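A minimal sketch of the adversarial idea, assuming illustrative shapes and losses: a generator rewrites watermarked inter-packet delays while a discriminator tries to distinguish them from clean traffic. This is a toy stand-in, not the paper's network.

```python
# Toy GAN over inter-packet delays (IPDs): G maps watermarked IPD windows to
# new positive IPDs close in timing; D separates clean from generated. All
# shapes, data, and loss weights are illustrative assumptions.
import torch
import torch.nn as nn

L = 64                                            # IPDs per flow window
G = nn.Sequential(nn.Linear(L, 128), nn.ReLU(), nn.Linear(128, L), nn.Softplus())
D = nn.Sequential(nn.Linear(L, 128), nn.ReLU(), nn.Linear(128, 1))
opt_g = torch.optim.Adam(G.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(D.parameters(), lr=1e-3)
bce = nn.BCEWithLogitsLoss()

for step in range(200):
    watermarked = torch.rand(32, L) * 0.1         # stand-in for marked IPDs
    clean = torch.rand(32, L) * 0.1               # stand-in for clean IPDs
    fake = G(watermarked)
    # discriminator: clean -> 1, generated -> 0
    d_loss = bce(D(clean), torch.ones(32, 1)) + bce(D(fake.detach()), torch.zeros(32, 1))
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # generator: fool D while staying close to the original timing
    g_loss = bce(D(fake), torch.ones(32, 1)) + 10.0 * (fake - watermarked).abs().mean()
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
```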
arXiv:2401.01181 (https://arxiv.org/abs/2401.01181) [cs.CV]
Title: Query-Based Knowledge Sharing for Open-Vocabulary Multi-Label Classification
Authors: Xuelin Zhu, Jian Liu, Dongqi Tang, Jiawei Ge, Weijia Liu, Bo Liu, Jiuxin Cao
Abstract: Identifying labels that did not appear during training, known as multi-label zero-shot learning, is a non-trivial task in computer vision. To this end, recent studies have attempted to exploit the multi-modal knowledge of vision-language pre-training (VLP) models via knowledge distillation, allowing unseen labels to be recognized in an open-vocabulary manner. However, experimental evidence shows that knowledge distillation is suboptimal and provides limited performance gain in unseen label prediction. In this paper, a novel query-based knowledge sharing paradigm is proposed to explore the multi-modal knowledge of the pretrained VLP model for open-vocabulary multi-label classification. Specifically, a set of learnable label-agnostic query tokens is trained to extract critical vision knowledge from the input image and is then shared across all labels, allowing each label to select tokens of interest as visual clues for recognition. In addition, we propose an effective prompt pool for robust label embedding and reformulate standard ranking learning into a form of classification so that the magnitude of feature vectors can be used for matching, both of which significantly benefit label recognition. Experimental results show that our framework significantly outperforms state-of-the-art methods on the zero-shot task, by 5.9% and 4.5% in mAP on NUS-WIDE and Open Images, respectively.
Submitted 2 January, 2024; originally announced January 2024.
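A minimal sketch of label-agnostic query tokens reading an image via cross-attention and being shared across label embeddings; the dimensions and the max-pooled scoring head are illustrative assumptions rather than the paper's exact design.

```python
# Learnable query tokens attend over image tokens once; every label then
# scores against the shared, query-extracted visual features. Sizes and the
# scoring head are illustrative assumptions.
import torch
import torch.nn as nn

class QueryShare(nn.Module):
    def __init__(self, dim=512, n_queries=16):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(n_queries, dim))   # label-agnostic
        self.attn = nn.MultiheadAttention(dim, num_heads=8, batch_first=True)

    def forward(self, image_tokens, label_embeds):
        B = image_tokens.size(0)
        q = self.queries.unsqueeze(0).expand(B, -1, -1)
        vis, _ = self.attn(q, image_tokens, image_tokens)          # queries read the image
        # each label selects its tokens of interest: similarity, then max over queries
        scores = torch.einsum("bqd,ld->bql", vis, label_embeds)
        return scores.max(dim=1).values                            # (B, num_labels)

model = QueryShare()
logits = model(torch.randn(2, 196, 512), torch.randn(80, 512))
```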
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01181v1-abstract-full').style.display = 'none'; document.getElementById('2401.01181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.17382">arXiv:2312.17382</a> <span> [<a href="https://arxiv.org/pdf/2312.17382">pdf</a>, <a href="https://arxiv.org/format/2312.17382">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Earth and Planetary Astrophysics">astro-ph.EP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Discovery of Small Ultra-short-period Planets Orbiting KG Dwarfs in Kepler Survey Using GPU Phase Folding and Deep Learning Detection System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaitlyn Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jian Ge</a>, <a href="/search/cs?searchtype=author&query=Willis%2C+K">Kevin Willis</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kevin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yinan Zhao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Quanquan Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.17382v3-abstract-short" style="display: inline;"> Of over 5,000 exoplanets identified so far, only a few hundred possess sub-Earth radii. The formation processes of these sub-Earths remain elusive, and acquiring additional samples is essential for investigating this unique population. In our study, we employ the GPFC method, a novel GPU Phase Folding algorithm combined with a Convolutional Neural Network, on Kepler photometry data. This method en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17382v3-abstract-full').style.display = 'inline'; document.getElementById('2312.17382v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.17382v3-abstract-full" style="display: none;"> Of over 5,000 exoplanets identified so far, only a few hundred possess sub-Earth radii. The formation processes of these sub-Earths remain elusive, and acquiring additional samples is essential for investigating this unique population. In our study, we employ the GPFC method, a novel GPU Phase Folding algorithm combined with a Convolutional Neural Network, on Kepler photometry data. This method enhances the transit search speed significantly over the traditional Box-fitting Least Squares method, allowing a complete search of the known Kepler KOI data within days using a commercial GPU card. To date, we have identified five new ultra-short-period planets (USPs): Kepler-158d, Kepler-963c, Kepler-879c, Kepler-1489c, and KOI-4978.02. Kepler-879c with a radius of $0.4 R_\oplus$ completes its orbit around a G dwarf in 0.646716 days. 
arXiv:2312.16204 (https://arxiv.org/abs/2312.16204) [cs.CV]
Title: Learning from Mistakes: Iterative Prompt Relabeling for Text-to-Image Diffusion Model Training
Authors: Xinyan Chen, Jiaxin Ge, Tianjun Zhang, Jiaming Liu, Shanghang Zhang
Abstract: Diffusion models have shown impressive performance in many domains. However, the model's capability to follow natural language instructions (e.g., spatial relationships between objects, generating complex scenes) is still unsatisfactory. In this work, we propose Iterative Prompt Relabeling (IPR), a novel algorithm that aligns images to text through iterative image sampling and prompt relabeling with feedback. IPR first samples a batch of images conditioned on the text, then relabels the text prompts of unmatched text-image pairs with classifier feedback. We conduct thorough experiments on SDv2 and SDXL, testing their capability to follow instructions on spatial relations. With IPR, we achieve an absolute improvement of up to 15.22% on the challenging spatial-relation VISOR benchmark, demonstrating superior performance compared to previous RL methods. Our code is publicly available at https://github.com/cxy000000/IPR-RLDF.
Submitted 9 October, 2024; v1 submitted 23 December, 2023; originally announced December 2023.
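The relabeling loop can be summarized in a few lines; sample, matches, relabel, and finetune below are hypothetical stand-ins for the diffusion sampler, the classifier-feedback checker, the prompt rewriter, and a training pass.

```python
# Sketch of the Iterative Prompt Relabeling loop. All four callables are
# hypothetical stand-ins, not the released IPR implementation.
def ipr(prompts, sample, matches, relabel, finetune, rounds=3):
    for _ in range(rounds):
        dataset = []
        for p in prompts:
            image = sample(p)                              # image conditioned on the prompt
            if matches(image, p):
                dataset.append((image, p))                 # keep aligned pairs as-is
            else:
                dataset.append((image, relabel(image, p))) # classifier feedback: fix the text
        finetune(dataset)                                  # train on the (re)labeled pairs
```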
arXiv:2312.15614 (https://arxiv.org/abs/2312.15614) [cs.SE, cs.AI, cs.CL]
Title: A Comprehensive Evaluation of Parameter-Efficient Fine-Tuning on Software Engineering Tasks
Authors: Wentao Zou, Qi Li, Jidong Ge, Chuanyi Li, Xiaoyu Shen, Liguo Huang, Bin Luo
Abstract: Pre-trained models (PTMs) have achieved great success in various Software Engineering (SE) downstream tasks following the "pre-train then fine-tune" paradigm. Since fully fine-tuning all parameters of a PTM can be computationally expensive, a widely used solution is parameter-efficient fine-tuning (PEFT), which freezes the PTM while introducing extra parameters. Though work has been done to test PEFT methods in the SE field, a comprehensive evaluation is still lacking. This paper aims to fill this gap by evaluating the effectiveness of five PEFT methods on eight PTMs and four SE downstream tasks. For different tasks and PEFT methods, we seek answers to the following research questions: 1) Is it more effective to use PTMs trained specifically on source code, or is it sufficient to use PTMs trained on natural language text? 2) What is the impact of varying model sizes? 3) How does the model architecture affect performance? Beyond effectiveness, we also discuss the efficiency of the PEFT methods in terms of training time and GPU resource consumption. We hope our findings can provide a deeper understanding of PEFT methods on various PTMs and SE downstream tasks. All code and data are available at https://github.com/zwtnju/PEFT.git.
Submitted 25 December, 2023; originally announced December 2023.
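As one concrete PEFT instance, here is a minimal LoRA setup with the HuggingFace peft library on a code PTM; the specific model and hyperparameters are illustrative assumptions, not the paper's exact configuration.

```python
# LoRA: freeze the base PTM and train small low-rank adapter matrices. The
# model choice and hyperparameters below are illustrative only.
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

base = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")
config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
                    r=8, lora_alpha=16, lora_dropout=0.1)
model = get_peft_model(base, config)
model.print_trainable_parameters()   # only a tiny fraction of weights train
```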
arXiv:2312.12155 (https://arxiv.org/abs/2312.12155) [cs.CV]
Title: Towards Balanced Alignment: Modal-Enhanced Semantic Modeling for Video Moment Retrieval
Authors: Zhihang Liu, Jun Li, Hongtao Xie, Pandeng Li, Jiannan Ge, Sun-Ao Liu, Guoqing Jin
Abstract: Video Moment Retrieval (VMR) aims to retrieve temporal segments in untrimmed videos corresponding to a given language query by constructing cross-modal alignment strategies. However, existing strategies are often sub-optimal since they ignore the modality imbalance problem, i.e., the semantic richness inherent in videos far exceeds that of a given limited-length sentence. Therefore, in pursuit of better alignment, a natural idea is to enhance the video modality to filter out query-irrelevant semantics and to enhance the text modality to capture more segment-relevant knowledge. In this paper, we introduce Modal-Enhanced Semantic Modeling (MESM), a novel framework for more balanced alignment that enhances features at two levels. First, we enhance the video modality at the frame-word level through word reconstruction. This strategy emphasizes the portions of frame-level features associated with query words while suppressing irrelevant parts. The enhanced video therefore contains less redundant semantics and is more balanced with the textual modality. Second, we enhance the textual modality at the segment-sentence level by learning complementary knowledge from context sentences and ground-truth segments. With this knowledge added to the query, the textual modality maintains more meaningful semantics and is more balanced with the video modality. By implementing both levels of MESM, the semantic information from the two modalities is more balanced for alignment, thereby bridging the modality gap. Experiments on three widely used benchmarks, including out-of-distribution settings, show that the proposed framework achieves new state-of-the-art performance with notable generalization ability (e.g., 4.42% and 7.69% average gains of R1@0.7 on Charades-STA and Charades-CG). The code is available at https://github.com/lntzm/MESM.
Submitted 19 December, 2023; originally announced December 2023.
Comments: Accepted to AAAI 2024
arXiv:2312.04160 (https://arxiv.org/abs/2312.04160) [cs.CV]
Title: Text as Image: Learning Transferable Adapter for Multi-Label Classification
Authors: Xuelin Zhu, Jiuxin Cao, Jian Liu, Dongqi Tang, Furong Xu, Weijia Liu, Jiawei Ge, Bo Liu, Qingpei Guo, Tianyi Zhang
Abstract: Pre-trained vision-language models have notably accelerated progress in open-world concept recognition. Their impressive zero-shot ability has recently been transferred to multi-label image classification via prompt tuning, enabling novel labels to be discovered in an open-vocabulary manner. However, this paradigm suffers from non-trivial training costs and becomes computationally prohibitive for a large number of candidate labels. To address this issue, we note that vision-language pre-training aligns images and texts in a unified embedding space, making it possible for an adapter network to identify labels in the visual modality while being trained in the text modality. To enhance this cross-modal transfer ability, we propose a simple yet effective method termed random perturbation, which enables the adapter to search for potential visual embeddings by perturbing text embeddings with noise during training, resulting in better performance in the visual modality. Furthermore, we introduce an effective approach to employ large language models for multi-label instruction-following text generation. In this way, a fully automated pipeline for visual label recognition is developed without relying on any manual data. Extensive experiments on public benchmarks show the superiority of our method on various multi-label classification tasks.
Submitted 7 December, 2023; originally announced December 2023.
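The random-perturbation idea reduces to a small training-step sketch: jitter text embeddings with Gaussian noise so the adapter also covers the nearby visual embeddings of the shared space. The sizes and noise scale below are assumptions.

```python
# Train an adapter on noised text embeddings; at inference the same adapter
# is applied to image embeddings from the shared VLP space. Dimensions,
# label count, and sigma are illustrative assumptions.
import torch
import torch.nn as nn

NUM_LABELS, DIM, SIGMA = 80, 512, 0.04
adapter = nn.Sequential(nn.Linear(DIM, 256), nn.ReLU(), nn.Linear(256, NUM_LABELS))
opt = torch.optim.Adam(adapter.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

def train_step(text_embeds, targets):
    noisy = text_embeds + SIGMA * torch.randn_like(text_embeds)  # probe nearby embeddings
    noisy = nn.functional.normalize(noisy, dim=-1)               # stay on the unit sphere
    loss = loss_fn(adapter(noisy), targets)
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

train_step(torch.randn(32, DIM), torch.randint(0, 2, (32, NUM_LABELS)).float())
```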
arXiv:2312.02249 (https://arxiv.org/abs/2312.02249) [cs.CV, cs.CL]
Title: Recursive Visual Programming
Authors: Jiaxin Ge, Sanjay Subramanian, Baifeng Shi, Roei Herzig, Trevor Darrell
Abstract: Visual Programming (VP) has emerged as a powerful framework for Visual Question Answering (VQA). By generating and executing bespoke code for each question, these methods demonstrate impressive compositional and reasoning capabilities, especially in few-shot and zero-shot scenarios. However, existing VP methods generate all code in a single function, resulting in code that is suboptimal in terms of both accuracy and interpretability. Inspired by human coding practices, we propose Recursive Visual Programming (RVP), which simplifies generated routines, provides more efficient problem solving, and can manage more complex data structures. RVP approaches VQA tasks with an iterative, recursive code-generation approach, allowing complicated problems to be decomposed into smaller parts. Notably, RVP is capable of dynamic type assignment: as the system recursively generates a new piece of code, it autonomously determines the appropriate return type and crafts the requisite code to produce that output. We show RVP's efficacy through extensive experiments on benchmarks including VSR, COVR, GQA, and NextQA, underscoring the value of adopting human-like recursive and modular programming techniques for solving VQA tasks through coding.
Submitted 10 July, 2024; v1 submitted 4 December, 2023; originally announced December 2023.
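A minimal sketch of the recursive code-generation loop, where generated code may call recurse() on sub-questions; llm_codegen is a hypothetical stand-in that returns Python source defining solve(image), and the exec-based sandbox is an illustrative simplification.

```python
# Recursive visual programming sketch: each (sub-)question gets bespoke code,
# and that code may itself call recurse() with its own chosen return type.
def recurse(question, image, llm_codegen, depth=0, max_depth=5):
    code = llm_codegen(question)                 # source text defining solve(image)
    env = {"recurse": (lambda q, img: recurse(q, img, llm_codegen, depth + 1, max_depth))
           if depth < max_depth else (lambda q, img: None)}
    exec(code, env)                              # define solve() in a scratch namespace
    return env["solve"](image)                   # return type is up to the generated code
```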
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02249v2-abstract-full').style.display = 'none'; document.getElementById('2312.02249v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02063">arXiv:2312.02063</a> <span> [<a href="https://arxiv.org/pdf/2312.02063">pdf</a>, <a href="https://arxiv.org/format/2312.02063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Earth and Planetary Astrophysics">astro-ph.EP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1093/mnras/stae245">10.1093/mnras/stae245 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet Transits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaitlyn Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jian Ge</a>, <a href="/search/cs?searchtype=author&query=Willis%2C+K">Kevin Willis</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kevin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yinan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02063v2-abstract-short" style="display: inline;"> This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase Folding and Convolutional Neural Network (CNN) system to detect exoplanets using the transit method. We devise a fast folding algorithm parallelized on a GPU to amplify low signal-to-noise ratio transit signals, allowing a search at high precision and speed. A CNN trained on two million synthetic light curves reports a score in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02063v2-abstract-full').style.display = 'inline'; document.getElementById('2312.02063v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02063v2-abstract-full" style="display: none;"> This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase Folding and Convolutional Neural Network (CNN) system to detect exoplanets using the transit method. We devise a fast folding algorithm parallelized on a GPU to amplify low signal-to-noise ratio transit signals, allowing a search at high precision and speed. 
A CNN trained on two million synthetic light curves reports a score indicating the likelihood of a planetary signal at each period. While the GPFC method has broad applicability across period ranges, this research specifically focuses on detecting ultra-short-period planets with orbital periods less than one day. GPFC improves on speed by three orders of magnitude over the predominant Box-fitting Least Squares (BLS) method. Our simulation results show GPFC achieves $97%$ training accuracy, higher true positive rate at the same false positive rate of detection, and higher precision at the same recall rate when compared to BLS. GPFC recovers $100\%$ of known ultra-short-period planets in $\textit{Kepler}$ light curves from a blind search. These results highlight the promise of GPFC as an alternative approach to the traditional BLS algorithm for finding new transiting exoplanets in data taken with $\textit{Kepler}$ and other space transit missions such as K2, TESS and future PLATO and Earth 2.0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02063v2-abstract-full').style.display = 'none'; document.getElementById('2312.02063v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 19 figures; Accepted for publication in the peer-reviewed journal, Monthly Notices of the Royal Astronomical Society (MNRAS), on January 20, 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> MNRAS, 528, 4053 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.17085">arXiv:2311.17085</a> <span> [<a href="https://arxiv.org/pdf/2311.17085">pdf</a>, <a href="https://arxiv.org/format/2311.17085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Visual Cues: Synchronously Exploring Target-Centric Semantics for Vision-Language Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangmei Chen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiuxin Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xuelin Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.17085v2-abstract-short" style="display: inline;"> Single object tracking aims to locate one specific target in video sequences, given its initial state. Classical trackers rely solely on visual cues, restricting their ability to handle challenges such as appearance variations, ambiguity, and distractions. 
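The CNN vetting stage can be pictured as a small 1-D convolutional scorer over a phase-folded curve; the architecture below is an illustrative assumption, not the trained GPFC network (the folding step itself is sketched under the companion entry above).

```python
# Toy 1-D CNN that scores a 256-bin phase-folded light curve for transit
# likelihood; layer sizes are illustrative assumptions.
import torch
import torch.nn as nn

cnn = nn.Sequential(
    nn.Conv1d(1, 16, kernel_size=7, padding=3), nn.ReLU(), nn.MaxPool1d(4),
    nn.Conv1d(16, 32, kernel_size=7, padding=3), nn.ReLU(), nn.MaxPool1d(4),
    nn.Flatten(), nn.Linear(32 * 16, 1),          # 256 bins -> one score
)
score = torch.sigmoid(cnn(torch.randn(8, 1, 256)))  # higher = more transit-like
```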
arXiv:2311.17085 (https://arxiv.org/abs/2311.17085) [cs.CV]
Title: Beyond Visual Cues: Synchronously Exploring Target-Centric Semantics for Vision-Language Tracking
Authors: Jiawei Ge, Xiangmei Chen, Jiuxin Cao, Xuelin Zhu, Bo Liu
Abstract: Single object tracking aims to locate one specific target in video sequences, given its initial state. Classical trackers rely solely on visual cues, restricting their ability to handle challenges such as appearance variations, ambiguity, and distractions. Hence, Vision-Language (VL) tracking has emerged as a promising approach, incorporating language descriptions to directly provide high-level semantics and enhance tracking performance. However, current VL trackers have not fully exploited the power of VL learning, as they suffer from limitations such as heavy reliance on off-the-shelf backbones for feature extraction, ineffective VL fusion designs, and the absence of VL-related loss functions. Consequently, we present a novel tracker that progressively explores target-centric semantics for VL tracking. Specifically, we propose the first Synchronous Learning Backbone (SLB) for VL tracking, which consists of two novel modules: the Target Enhance Module (TEM) and the Semantic Aware Module (SAM). These modules enable the tracker to perceive target-related semantics and comprehend the context of both visual and textual modalities at the same pace, facilitating VL feature extraction and fusion at different semantic levels. Moreover, we devise a dense matching loss to further strengthen multi-modal representation learning. Extensive experiments on VL tracking datasets demonstrate the superiority and effectiveness of our methods.
Submitted 19 February, 2024; v1 submitted 27 November, 2023; originally announced November 2023.
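One plausible form of a dense matching loss, pulling each word feature toward its best-matching visual region; this illustrates the idea and is not necessarily the paper's exact formulation.

```python
# Dense word-to-region matching: cosine similarity per word-region pair, then
# maximize each word's best-region similarity. An illustrative loss form.
import torch
import torch.nn.functional as F

def dense_matching_loss(words, regions):
    # words: (B, W, D), regions: (B, R, D)
    sim = torch.einsum("bwd,brd->bwr", F.normalize(words, dim=-1),
                       F.normalize(regions, dim=-1))
    return -sim.max(dim=-1).values.mean()        # pull each word to its region

loss = dense_matching_loss(torch.randn(4, 12, 256), torch.randn(4, 49, 256))
```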