Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 174 results for author: <span class="mathjax">Nie, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Nie%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Nie, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Nie%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Nie, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14948">arXiv:2502.14948</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14948">pdf</a>, <a href="https://arxiv.org/format/2502.14948">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Learning to Solve and Verify: A Self-Play Framework for Code and Test Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+J">Jingbo Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yixin Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14948v2-abstract-short" style="display: inline;"> Recent advances in large language models (LLMs) have improved their performance on coding benchmarks. However, improvement is plateauing due to the exhaustion of readily available high-quality data. 
Prior work has shown the potential of synthetic self-instruct data, but naively training on a model&#39;s own outputs can cause error accumulation, especially in coding tasks, where generalization may coll&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14948v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14948v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14948v2-abstract-full" style="display: none;"> Recent advances in large language models (LLMs) have improved their performance on coding benchmarks. However, improvement is plateauing due to the exhaustion of readily available high-quality data. Prior work has shown the potential of synthetic self-instruct data, but naively training on a model&#39;s own outputs can cause error accumulation, especially in coding tasks, where generalization may collapse due to overly simple or erroneous training data, highlighting the need for rigorous quality checks on synthetic data. In this work, we explore an effective approach whereby the model itself verifies the correctness of its own data. We thus propose Sol-Ver, a self-play solver-verifier framework that jointly improves a single model&#39;s code and test generation capacity. By iteratively refining code (LLM-as-a-solver) and tests (LLM-as-a-verifier) together, we boost both capabilities without relying on human annotations or larger teacher models. Experiments with the Llama 3.1 8B model demonstrate substantial performance enhancements, achieving average relative improvements of 19.63% in code generation and 17.49% in test generation on MBPP and LiveCodeBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14948v2-abstract-full').style.display = 'none'; document.getElementById('2502.14948v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
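The abstract only names the solver-verifier loop; the sketch below is one plausible reading of a single self-play round, not the paper's implementation. The model interface (generate_code, generate_tests) and the run_tests/finetune helpers are hypothetical placeholders.

```python
# Minimal sketch of one self-play solver-verifier round (cf. arXiv:2502.14948).
# All interfaces here are hypothetical placeholders, not the paper's API.
from dataclasses import dataclass

@dataclass
class VerifiedExample:
    problem: str
    code: str
    tests: str

def self_play_round(model, problems, run_tests, finetune):
    """The same model acts as solver (code) and verifier (tests);
    only self-consistent pairs are kept as synthetic training data."""
    accepted = []
    for problem in problems:
        code = model.generate_code(problem)    # LLM-as-a-solver
        tests = model.generate_tests(problem)  # LLM-as-a-verifier
        if run_tests(code, tests):             # quality check on synthetic data
            accepted.append(VerifiedExample(problem, code, tests))
    return finetune(model, accepted)           # refined model feeds the next round
```
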
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12640">arXiv:2502.12640</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12640">pdf</a>, <a href="https://arxiv.org/format/2502.12640">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RecDreamer: Consistent Text-to-3D Generation via Uniform Score Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+C">Chenxi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yihong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bangzhen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yongwei Nie</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shengfeng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12640v1-abstract-short" style="display: inline;"> Current text-to-3D generation methods based on score distillation often suffer from geometric inconsistencies, leading to repeated patterns across different poses of 3D assets. This issue, known as the Multi-Face Janus problem, arises because existing methods struggle to maintain consistency across varying poses and are biased toward a canonical pose. While recent work has improved pose control an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12640v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12640v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12640v1-abstract-full" style="display: none;"> Current text-to-3D generation methods based on score distillation often suffer from geometric inconsistencies, leading to repeated patterns across different poses of 3D assets. This issue, known as the Multi-Face Janus problem, arises because existing methods struggle to maintain consistency across varying poses and are biased toward a canonical pose. While recent work has improved pose control and approximation, these efforts are still limited by this inherent bias, which skews the guidance during generation. To address this, we propose a solution called RecDreamer, which reshapes the underlying data distribution to achieve a more consistent pose representation. The core idea behind our method is to rectify the prior distribution, ensuring that pose variation is uniformly distributed rather than biased toward a canonical form. By modifying the prescribed distribution through an auxiliary function, we can reconstruct the density of the distribution to ensure compliance with specific marginal constraints. In particular, we ensure that the marginal distribution of poses follows a uniform distribution, thereby eliminating the biases introduced by the prior knowledge. 
We incorporate this rectified data distribution into existing score distillation algorithms, a process we refer to as uniform score distillation. To efficiently compute the posterior distribution required for the auxiliary function, RecDreamer introduces a training-free classifier that estimates pose categories in a plug-and-play manner. Additionally, we utilize various approximation techniques for noisy states, significantly improving system performance. Our experimental results demonstrate that RecDreamer effectively mitigates the Multi-Face Janus problem, leading to more consistent 3D asset generation across different poses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12640v1-abstract-full').style.display = 'none'; document.getElementById('2502.12640v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06220">arXiv:2502.06220</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06220">pdf</a>, <a href="https://arxiv.org/format/2502.06220">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> FunduSAM: A Specialized Deep Learning Model for Enhanced Optic Disc and Cup Segmentation in Fundus Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jinchen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yongwei Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+F">Fei Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+W">Wenxiong Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+H">Hongmin Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06220v1-abstract-short" style="display: inline;"> The Segment Anything Model (SAM) has gained popularity as a versatile image segmentation method, thanks to its strong generalization capabilities across various domains. However, when applied to optic disc (OD) and optic cup (OC) segmentation tasks, SAM encounters challenges due to the complex structures, low contrast, and blurred boundaries typical of fundus images, leading to suboptimal performa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06220v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06220v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06220v1-abstract-full" style="display: none;"> The Segment Anything Model (SAM) has gained popularity as a versatile image segmentation method, thanks to its strong generalization capabilities across various domains. 
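A "rectified prior with a uniform pose marginal" admits a simple generic illustration: reweight samples by the inverse of their estimated pose-bin frequency. The snippet below shows only that inverse-frequency idea; it is not RecDreamer's auxiliary function, and uniform_pose_weights is a name invented here.

```python
import numpy as np

def uniform_pose_weights(pose_probs: np.ndarray) -> np.ndarray:
    """Per-sample weights that push the marginal over K discrete pose bins toward uniform.

    pose_probs: (N, K) classifier-estimated pose probabilities per sample
    (e.g. from a training-free pose classifier). Frequent, canonical poses are
    down-weighted; rare poses are up-weighted.
    """
    marginal = pose_probs.mean(axis=0)                          # empirical pose marginal, shape (K,)
    weights = pose_probs @ (1.0 / np.maximum(marginal, 1e-8))   # per-sample E_k[p_i(k) / p(k)]
    return weights / weights.sum()                              # normalize to sum to 1
```
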
3. arXiv:2502.06220 (https://arxiv.org/abs/2502.06220) [pdf, other]
   Tags: cs.CV (Computer Vision and Pattern Recognition); cs.IR (Information Retrieval)
   Title: FunduSAM: A Specialized Deep Learning Model for Enhanced Optic Disc and Cup Segmentation in Fundus Images
   Authors: Jinchen Yu, Yongwei Nie, Fei Qi, Wenxiong Liao, Hongmin Cai
   Abstract: The Segment Anything Model (SAM) has gained popularity as a versatile image segmentation method, thanks to its strong generalization capabilities across various domains. However, when applied to optic disc (OD) and optic cup (OC) segmentation tasks, SAM encounters challenges due to the complex structures, low contrast, and blurred boundaries typical of fundus images, leading to suboptimal performance. To overcome these challenges, we introduce a novel model, FunduSAM, which incorporates several Adapters into SAM to create a deep network specifically designed for OD and OC segmentation. FunduSAM inserts an Adapter into each transformer block after the encoder for parameter-efficient fine-tuning (PEFT). It enhances SAM's feature extraction capabilities through a Convolutional Block Attention Module (CBAM), addressing issues related to blurred boundaries and low contrast. Given the unique requirements of OD and OC segmentation, polar transformation is used to convert the original fundus OD images into a format better suited for training and evaluating FunduSAM. A joint loss is used to preserve the structural relationship between the OD and OC while maintaining accurate segmentation. Extensive experiments on the REFUGE dataset, comprising 1,200 fundus images, demonstrate the superior performance of FunduSAM compared to five mainstream approaches.
   Submitted 10 February, 2025; originally announced February 2025.

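The polar transformation mentioned in the abstract is a standard preprocessing step; the snippet below is a generic nearest-neighbour polar warp for illustration only, not FunduSAM's preprocessing code (the function name and defaults are assumptions).

```python
import numpy as np

def polar_transform(image: np.ndarray, center=None, radius=None, out_h=256, out_w=256):
    """Nearest-neighbour polar warp: output rows index radius, columns index angle."""
    h, w = image.shape[:2]
    cy, cx = center if center is not None else (h / 2.0, w / 2.0)
    r_max = radius if radius is not None else min(h, w) / 2.0
    radii = np.linspace(0.0, r_max, out_h)
    angles = np.linspace(0.0, 2.0 * np.pi, out_w, endpoint=False)
    ys = cy + radii[:, None] * np.sin(angles)[None, :]   # (out_h, out_w) sample rows
    xs = cx + radii[:, None] * np.cos(angles)[None, :]   # (out_h, out_w) sample cols
    ys = np.clip(np.rint(ys), 0, h - 1).astype(int)
    xs = np.clip(np.rint(xs), 0, w - 1).astype(int)
    return image[ys, xs]                                  # works for grayscale or color images
```
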
4. arXiv:2502.02196 (https://arxiv.org/abs/2502.02196) [pdf, other]
   Tags: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: Exploiting Ensemble Learning for Cross-View Isolated Sign Language Recognition
   Authors: Fei Wang, Kun Li, Yiqi Nie, Zhangling Duan, Peng Zou, Zhiliang Wu, Yuwei Wang, Yanyan Wei
   Abstract: In this paper, we present our solution to the Cross-View Isolated Sign Language Recognition (CV-ISLR) challenge held at WWW 2025. CV-ISLR addresses a critical issue in traditional Isolated Sign Language Recognition (ISLR), where existing datasets predominantly capture sign language videos from a frontal perspective, while real-world camera angles often vary. To accurately recognize sign language from different viewpoints, models must be capable of understanding gestures from multiple angles, making cross-view recognition challenging. To address this, we explore the advantages of ensemble learning, which enhances model robustness and generalization across diverse views. Our approach, built on a multi-dimensional Video Swin Transformer model, leverages this ensemble strategy to achieve competitive performance. Finally, our solution ranked 3rd in both the RGB-based ISLR and RGB-D-based ISLR tracks, demonstrating its effectiveness in handling the challenges of cross-view recognition. The code is available at: https://github.com/Jiafei127/CV_ISLR_WWW2025.
   Submitted 4 February, 2025; originally announced February 2025.
   Comments: 3rd Place in Cross-View Isolated Sign Language Recognition Challenge at WWW 2025

5. arXiv:2501.15579 (https://arxiv.org/abs/2501.15579) [pdf, other]
   Tags: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
   Title: ConceptCLIP: Towards Trustworthy Medical AI via Concept-Enhanced Contrastive Langauge-Image Pre-training
   Authors: Yuxiang Nie, Sunan He, Yequan Bie, Yihui Wang, Zhixuan Chen, Shu Yang, Hao Chen
   Abstract: Trustworthiness is essential for the precise and interpretable application of artificial intelligence (AI) in medical imaging. Traditionally, precision and interpretability have been addressed as separate tasks, namely medical image analysis and explainable AI, each developing its own models independently. In this study, for the first time, we investigate the development of a unified medical vision-language pre-training model that can achieve both accurate analysis and interpretable understanding of medical images across various modalities. To build the model, we construct MedConcept-23M, a large-scale dataset comprising 23 million medical image-text pairs extracted from 6.2 million scientific articles, enriched with concepts from the Unified Medical Language System (UMLS). Based on MedConcept-23M, we introduce ConceptCLIP, a medical AI model utilizing concept-enhanced contrastive language-image pre-training. The pre-training of ConceptCLIP involves two primary components: image-text alignment learning (IT-Align) and patch-concept alignment learning (PC-Align). This dual alignment strategy enhances the model's capability to associate specific image regions with relevant concepts, thereby improving both the precision of analysis and the interpretability of the AI system. We conducted extensive experiments on 5 diverse types of medical image analysis tasks, spanning 51 subtasks across 10 image modalities, with the broadest range of downstream tasks. The results demonstrate the effectiveness of the proposed vision-language pre-training model. Further explainability analysis across 6 modalities reveals that ConceptCLIP achieves superior performance, underscoring its robust ability to advance explainable AI in medical imaging. These findings highlight ConceptCLIP's capability in promoting trustworthy AI in the field of medicine.
   Submitted 26 January, 2025; originally announced January 2025.

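The image-text alignment component (IT-Align) is described as contrastive language-image pre-training; for reference, the snippet below shows the standard CLIP-style symmetric InfoNCE objective. The exact IT-Align and PC-Align losses are defined in the paper, not here.

```python
import torch
import torch.nn.functional as F

def clip_alignment_loss(image_emb: torch.Tensor, text_emb: torch.Tensor, temperature: float = 0.07):
    """Symmetric InfoNCE loss over a batch of paired image/text embeddings (B, D)."""
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature                 # (B, B) similarity matrix
    targets = torch.arange(logits.size(0), device=logits.device)    # matched pairs on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```
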
href="/search/cs?searchtype=author&amp;query=Zhang%2C+O">Oliver Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T">Tung Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+D">Daron Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+I+A">Imad Ali Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Doroshenko%2C+M">Mikhail Doroshenko</a>, <a href="/search/cs?searchtype=author&amp;query=Stokes%2C+A+C">Alun Cennyth Stokes</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmood%2C+M">Mobeen Mahmood</a> , et al. (709 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14249v5-abstract-short" style="display: inline;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'inline'; document.getElementById('2501.14249v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14249v5-abstract-full" style="display: none;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 2,700 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'none'; document.getElementById('2501.14249v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09530">arXiv:2412.09530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09530">pdf</a>, <a href="https://arxiv.org/format/2412.09530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-VLM: Simple Dynamic Visual Token Compression for VideoLLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yongjie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=GuanYu%2C+D">Deng GuanYu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jinghui Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Can Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09530v1-abstract-short" style="display: inline;"> The application of Large Vision-Language Models (LVLMs) for analyzing images and videos is an exciting and rapidly evolving field. In recent years, we&#39;ve seen significant growth in high-quality image-text datasets for fine-tuning image understanding, but there is still a lack of comparable datasets for videos. Additionally, many VideoLLMs are extensions of single-image VLMs, which may not efficien&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09530v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09530v1-abstract-full" style="display: none;"> The application of Large Vision-Language Models (LVLMs) for analyzing images and videos is an exciting and rapidly evolving field. In recent years, we&#39;ve seen significant growth in high-quality image-text datasets for fine-tuning image understanding, but there is still a lack of comparable datasets for videos. Additionally, many VideoLLMs are extensions of single-image VLMs, which may not efficiently handle the complexities of longer videos. In this study, we introduce a large-scale synthetic dataset created from proprietary models, using carefully designed prompts to tackle a wide range of questions. We also explore a dynamic visual token compression architecture that strikes a balance between computational efficiency and performance. Our proposed \model{} achieves state-of-the-art results across various video tasks and shows impressive generalization, setting new baselines in multi-image understanding. Notably, \model{} delivers an absolute improvement of 2.7\% over LLaVA-OneVision on VideoMME and 10.7\% on MuirBench. 
7. arXiv:2412.09530 (https://arxiv.org/abs/2412.09530) [pdf, other]
   Tags: cs.CV (Computer Vision and Pattern Recognition)
   Title: Dynamic-VLM: Simple Dynamic Visual Token Compression for VideoLLM
   Authors: Han Wang, Yuxiang Nie, Yongjie Ye, Deng GuanYu, Yanjie Wang, Shuai Li, Haiyang Yu, Jinghui Lu, Can Huang
   Abstract: The application of Large Vision-Language Models (LVLMs) for analyzing images and videos is an exciting and rapidly evolving field. In recent years, we've seen significant growth in high-quality image-text datasets for fine-tuning image understanding, but there is still a lack of comparable datasets for videos. Additionally, many VideoLLMs are extensions of single-image VLMs, which may not efficiently handle the complexities of longer videos. In this study, we introduce a large-scale synthetic dataset created from proprietary models, using carefully designed prompts to tackle a wide range of questions. We also explore a dynamic visual token compression architecture that strikes a balance between computational efficiency and performance. Our proposed Dynamic-VLM achieves state-of-the-art results across various video tasks and shows impressive generalization, setting new baselines in multi-image understanding. Notably, Dynamic-VLM delivers an absolute improvement of 2.7% over LLaVA-OneVision on VideoMME and 10.7% on MuirBench. Code is available at https://github.com/Hon-Wong/ByteVideoLLM.
   Submitted 12 December, 2024; originally announced December 2024.

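The abstract does not spell out the dynamic visual token compression; the snippet below shows one generic way to keep the total number of visual tokens within a budget by pooling more aggressively as the frame count grows. It is an illustrative baseline, not the paper's architecture; compress_visual_tokens and token_budget are assumptions.

```python
import torch
import torch.nn.functional as F

def compress_visual_tokens(frame_tokens: torch.Tensor, token_budget: int) -> torch.Tensor:
    """Pool per-frame visual tokens so num_frames * kept_tokens stays within a budget.

    frame_tokens: (num_frames, tokens_per_frame, dim). More frames -> fewer tokens per frame.
    """
    num_frames, tokens_per_frame, dim = frame_tokens.shape
    kept = max(1, min(tokens_per_frame, token_budget // num_frames))
    x = frame_tokens.transpose(1, 2)        # (F, D, T): pool along the token axis
    x = F.adaptive_avg_pool1d(x, kept)      # (F, D, kept)
    return x.transpose(1, 2)                # (F, kept, D)
```
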
8. arXiv:2412.06482 (https://arxiv.org/abs/2412.06482) [pdf, other]
   Tags: math.OC (Optimization and Control); cs.GT (Computer Science and Game Theory); eess.SY (Systems and Control)
   Title: A Cardinality-Constrained Approach to Combinatorial Bilevel Congestion Pricing
   Authors: Lei Guo, Jiayang Li, Yu Marco Nie, Jun Xie
   Abstract: Combinatorial bilevel congestion pricing (CBCP), a variant of the discrete network design problem, seeks to minimize the total travel time experienced by all travelers in a road network, by strategically selecting toll locations and determining the corresponding charges. Conventional wisdom suggests that these problems are intractable since they have to be formulated and solved with a significant number of integer variables. Here, we devise a scalable local algorithm for the CBCP problem that guarantees convergence to a Karush-Kuhn-Tucker (KKT) point. Our approach is novel in that it eliminates the use of integer variables altogether, instead introducing a cardinality constraint that limits the number of toll locations to a user-specified upper bound. The resulting bilevel program with the cardinality constraint is then transformed into a block-separable, single-level optimization problem that can be solved efficiently after penalization and decomposition. We are able to apply the algorithm to solve, in about 20 minutes, a CBCP instance with up to 3,000 links, of which hundreds can be tolled. To the best of our knowledge, no existing algorithm can solve CBCP problems at such a scale while providing any assurance of convergence.
   Submitted 11 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.

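To make the cardinality constraint concrete: limiting tolls to at most k locations means restricting the toll vector to the set {x >= 0 : ||x||_0 <= k}, and the Euclidean projection onto that set keeps the k largest tolls and zeroes the rest. The snippet below shows only that standard hard-thresholding projection; it is not the paper's penalization-and-decomposition algorithm, and the function name is invented here.

```python
import numpy as np

def project_to_cardinality(tolls: np.ndarray, k: int) -> np.ndarray:
    """Project a toll vector onto {x : x >= 0, ||x||_0 <= k} (keep the k largest tolls)."""
    x = np.maximum(tolls, 0.0)          # tolls cannot be negative
    if k <= 0:
        return np.zeros_like(x)
    if k >= x.size:
        return x
    keep = np.argsort(x)[-k:]           # indices of the k largest tolls
    projected = np.zeros_like(x)
    projected[keep] = x[keep]
    return projected
```
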
9. arXiv:2412.05734 (https://arxiv.org/abs/2412.05734) [pdf, other]
   Tags: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
   Title: PrivAgent: Agentic-based Red-teaming for LLM Privacy Leakage
   Authors: Yuzhou Nie, Zhun Wang, Ye Yu, Xian Wu, Xuandong Zhao, Wenbo Guo, Dawn Song
   Abstract: Recent studies have discovered that LLMs have serious privacy leakage concerns, where an LLM may be fooled into outputting private information under carefully crafted adversarial prompts. These risks include leaking system prompts, personally identifiable information, training data, and model parameters. Most existing red-teaming approaches for privacy leakage rely on humans to craft the adversarial prompts. A few automated methods have been proposed for system prompt extraction, but they cannot be applied to more severe risks (e.g., training data extraction) and have limited effectiveness even for system prompt extraction. In this paper, we propose PrivAgent, a novel black-box red-teaming framework for LLM privacy leakage. We formulate different risks as a search problem with a unified attack goal. Our framework trains an open-source LLM through reinforcement learning as the attack agent to generate adversarial prompts for different target models under different risks. We propose a novel reward function to provide effective and fine-grained rewards for the attack agent. Finally, we introduce customizations to better fit our general framework to system prompt extraction and training data extraction. Through extensive evaluations, we first show that PrivAgent outperforms existing automated methods in system prompt leakage against six popular LLMs. Notably, our approach achieves a 100% success rate in extracting system prompts from real-world applications in OpenAI's GPT Store. We also show PrivAgent's effectiveness in extracting training data from an open-source LLM with a success rate of 5.9%. We further demonstrate PrivAgent's effectiveness in evading the existing guardrail defense and its helpfulness in enabling better safety alignment. Finally, we validate our customized designs through a detailed ablation study. We release our code at https://github.com/rucnyz/RedAgent.
   Submitted 7 December, 2024; originally announced December 2024.

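The abstract frames red-teaming as reinforcement learning with a fine-grained reward; the snippet below sketches a generic black-box loop with a crude string-similarity reward, purely for illustration. The attacker/target interfaces and the reward are assumptions, not PrivAgent's actual design.

```python
from difflib import SequenceMatcher

def leakage_reward(response: str, secret: str) -> float:
    """Similarity between the target's response and the secret being extracted
    (e.g. a system prompt). A crude stand-in reward, not the paper's reward function."""
    return SequenceMatcher(None, response, secret).ratio()

def red_team_step(attacker, target, secret: str):
    """One black-box step: the attack agent proposes a prompt and is rewarded by
    how much of the secret the target leaks. `attacker` and `target` are hypothetical."""
    prompt = attacker.propose()              # adversarial prompt from the attack agent
    response = target(prompt)                # query the target model
    reward = leakage_reward(response, secret)
    attacker.update(prompt, reward)          # e.g. an RL policy update on the agent
    return prompt, reward
```
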
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17949">arXiv:2411.17949</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17949">pdf</a>, <a href="https://arxiv.org/format/2411.17949">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ROICtrl: Boosting Instance Control for Visual Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuchao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yipin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yunfan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yixin Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Licheng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K+Q">Kevin Qinghong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17949v1-abstract-short" style="display: inline;"> Natural language often struggles to accurately associate positional and attribute information with multiple instances, which limits current text-based visual generation models to simpler compositions featuring only a few dominant instances. To address this limitation, this work enhances diffusion models by introducing regional instance control, where each instance is governed by a bounding box pai&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17949v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17949v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17949v1-abstract-full" style="display: none;"> Natural language often struggles to accurately associate positional and attribute information with multiple instances, which limits current text-based visual generation models to simpler compositions featuring only a few dominant instances. To address this limitation, this work enhances diffusion models by introducing regional instance control, where each instance is governed by a bounding box paired with a free-form caption. Previous methods in this area typically rely on implicit position encoding or explicit attention masks to separate regions of interest (ROIs), resulting in either inaccurate coordinate injection or large computational overhead. Inspired by ROI-Align in object detection, we introduce a complementary operation called ROI-Unpool. Together, ROI-Align and ROI-Unpool enable explicit, efficient, and accurate ROI manipulation on high-resolution feature maps for visual generation. Building on ROI-Unpool, we propose ROICtrl, an adapter for pretrained diffusion models that enables precise regional instance control. ROICtrl is compatible with community-finetuned diffusion models, as well as with existing spatial-based add-ons (\eg, ControlNet, T2I-Adapter) and embedding-based add-ons (\eg, IP-Adapter, ED-LoRA), extending their applications to multi-instance generation. 
Experiments show that ROICtrl achieves superior performance in regional instance control while significantly reducing computational costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17949v1-abstract-full').style.display = 'none'; document.getElementById('2411.17949v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at https://roictrl.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11930">arXiv:2411.11930</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11930">pdf</a>, <a href="https://arxiv.org/format/2411.11930">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AtomThink: A Slow Thinking Framework for Multimodal Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+K">Kun Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhili Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zihao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yunshuang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Runhui Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Haoxiang Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hanhui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weiran Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yihan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11930v3-abstract-short" style="display: inline;"> In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of ``slow thinking&#34; into multimodal large language models (MLLMs). 
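ROI-Align is available in PyTorch as torchvision.ops.roi_align; ROI-Unpool is the paper's new complementary operator and is not implemented in any standard library. The snippet below is only a naive "paste the ROI features back" stand-in to convey the direction of the operation; it is not the ROI-Unpool defined in the paper, and naive_roi_unpool is a name invented here.

```python
import torch
import torch.nn.functional as F

def naive_roi_unpool(roi_feats: torch.Tensor, box: torch.Tensor, out_hw, channels: int):
    """Paste fixed-size ROI features (C, h, w) back into a zero feature map of size out_hw.

    box: (4,) tensor (x1, y1, x2, y2) in feature-map coordinates.
    """
    H, W = out_hw
    canvas = torch.zeros(channels, H, W, dtype=roi_feats.dtype, device=roi_feats.device)
    x1, y1, x2, y2 = [int(round(v)) for v in box.tolist()]
    x1, y1 = min(max(x1, 0), W - 1), min(max(y1, 0), H - 1)     # clamp into bounds
    x2, y2 = min(max(x2, x1 + 1), W), min(max(y2, y1 + 1), H)   # at least 1 px wide/tall
    resized = F.interpolate(roi_feats[None], size=(y2 - y1, x2 - x1),
                            mode="bilinear", align_corners=False)[0]
    canvas[:, y1:y2, x1:x2] = resized
    return canvas
```
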

arXiv:2411.11930 (https://arxiv.org/abs/2411.11930) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
AtomThink: A Slow Thinking Framework for Multimodal Mathematical Reasoning
Authors: Kun Xiang, Zhili Liu, Zihao Jiang, Yunshuang Nie, Runhui Huang, Haoxiang Fan, Hanhui Li, Weiran Huang, Yihan Zeng, Jianhua Han, Lanqing Hong, Hang Xu, Xiaodan Liang
Abstract: In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of "slow thinking" into multimodal large language models (MLLMs). Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoning. To this end, we design a novel AtomThink framework composed of three key modules: (i) a CoT annotation engine that automatically generates high-quality CoT annotations to address the lack of high-quality visual mathematical data; (ii) an atomic step fine-tuning strategy that jointly optimizes an MLLM and a policy reward model (PRM) for step-wise reasoning; and (iii) four different search strategies that can be applied with the PRM to complete reasoning. Additionally, we propose AtomMATH, a large-scale multimodal dataset of long CoTs, and an atomic capability evaluation metric for mathematical tasks. Extensive experimental results show that the proposed AtomThink significantly improves the performance of baseline MLLMs, achieving approximately 50% relative accuracy gains on MathVista and 120% on MathVerse. To support the advancement of multimodal slow-thinking models, we will make our code and dataset publicly available on https://github.com/Quinn777/AtomThink.
Submitted 13 December, 2024; v1 submitted 18 November, 2024; originally announced November 2024.
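
As a loose illustration of the PRM-guided step-wise search the abstract lists, the hedged sketch below samples candidate atomic steps and keeps the one a reward model scores highest; propose_steps and score_chain are hypothetical stand-ins for the MLLM and the PRM, not AtomThink's components.

```python
# Hedged sketch of PRM-guided step-wise search: at each step, sample several
# candidate atomic steps, score the extended partial chains with a reward
# model, and keep the best one. The generator and scorer are toy stand-ins.
import random
from typing import Callable, List

def greedy_prm_search(question: str,
                      propose_steps: Callable[[str, List[str], int], List[str]],
                      score_chain: Callable[[str, List[str]], float],
                      num_candidates: int = 4,
                      max_steps: int = 8) -> List[str]:
    chain: List[str] = []
    for _ in range(max_steps):
        candidates = propose_steps(question, chain, num_candidates)
        if not candidates:
            break
        # Score each extended chain with the reward model and keep the best step.
        best = max(candidates, key=lambda s: score_chain(question, chain + [s]))
        chain.append(best)
        if best.strip().lower().startswith("final answer"):
            break
    return chain

# Toy stand-ins so the sketch runs end to end.
def propose_steps(q, chain, k):
    if len(chain) >= 2:
        return ["Final answer: 42"]
    return [f"Step {len(chain) + 1}, variant {i}" for i in range(k)]

def score_chain(q, chain):
    return random.random()

print(greedy_prm_search("toy question", propose_steps, score_chain))
```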

arXiv:2411.11543 (https://arxiv.org/abs/2411.11543) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
PSA-VLM: Enhancing Vision-Language Model Safety through Progressive Concept-Bottleneck-Driven Alignment
Authors: Zhendong Liu, Yuanbi Nie, Yingshui Tan, Jiaheng Liu, Xiangyu Yue, Qiushi Cui, Chongjun Wang, Xiaoyong Zhu, Bo Zheng
Abstract: Benefiting from the powerful capabilities of Large Language Models (LLMs), pre-trained visual encoder models connected to LLMs form Vision Language Models (VLMs). However, recent research shows that the visual modality in VLMs is highly vulnerable, allowing attackers to bypass safety alignment in LLMs through visually transmitted content, launching harmful attacks. To address this challenge, we propose a progressive concept-based alignment strategy, PSA-VLM, which incorporates safety modules as concept bottlenecks to enhance visual modality safety alignment. By aligning model predictions with specific safety concepts, we improve defenses against risky images, enhancing explainability and controllability while minimally impacting general performance. Our method is obtained through two-stage training. The low computational cost of the first stage brings very effective performance improvement, and the fine-tuning of the language model in the second stage further improves the safety performance. Our method achieves state-of-the-art results on popular VLM safety benchmarks.
Submitted 13 January, 2025; v1 submitted 18 November, 2024; originally announced November 2024.
Comments: arXiv admin note: substantial text overlap with arXiv:2405.13581

arXiv:2411.03047 (https://arxiv.org/abs/2411.03047) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
GarVerseLOD: High-Fidelity 3D Garment Reconstruction from a Single In-the-Wild Image using a Dataset with Levels of Details
Authors: Zhongjin Luo, Haolin Liu, Chenghong Li, Wanghao Du, Zirong Jin, Wanhu Sun, Yinyu Nie, Weikai Chen, Xiaoguang Han
Abstract: Neural implicit functions have brought impressive advances to the state-of-the-art of clothed human digitization from multiple or even single images. However, despite the progress, current methods still have difficulty generalizing to unseen images with complex cloth deformation and body poses. In this work, we present GarVerseLOD, a new dataset and framework that paves the way to achieving unprecedented robustness in high-fidelity 3D garment reconstruction from a single unconstrained image. Inspired by the recent success of large generative models, we believe that one key to addressing the generalization challenge lies in the quantity and quality of 3D garment data. Towards this end, GarVerseLOD collects 6,000 high-quality cloth models with fine-grained geometry details manually created by professional artists. In addition to the scale of training data, we observe that having disentangled granularities of geometry can play an important role in boosting the generalization capability and inference accuracy of the learned model. We hence craft GarVerseLOD as a hierarchical dataset with levels of details (LOD), spanning from detail-free stylized shape to pose-blended garment with pixel-aligned details. This allows us to make this highly under-constrained problem tractable by factorizing the inference into easier tasks, each narrowed down to a smaller search space. To ensure GarVerseLOD can generalize well to in-the-wild images, we propose a novel labeling paradigm based on conditional diffusion models to generate extensive paired images for each garment model with high photorealism. We evaluate our method on a massive amount of in-the-wild images. Experimental results demonstrate that GarVerseLOD can generate standalone garment pieces with significantly better quality than prior approaches. Project page: https://garverselod.github.io/
Submitted 5 November, 2024; originally announced November 2024.
Comments: Project page: https://garverselod.github.io/

arXiv:2410.19917 (https://arxiv.org/abs/2410.19917) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.IT (Information Theory); cs.LG (Machine Learning)
Collaborative Inference over Wireless Channels with Feature Differential Privacy
Authors: Mohamed Seif, Yuqi Nie, Andrea J. Goldsmith, H. Vincent Poor
Abstract: Collaborative inference among multiple wireless edge devices has the potential to significantly enhance Artificial Intelligence (AI) applications, particularly for sensing and computer vision. This approach typically involves a three-stage process: a) data acquisition through sensing, b) feature extraction, and c) feature encoding for transmission. However, transmitting the extracted features poses a significant privacy risk, as sensitive personal data can be exposed during the process. To address this challenge, we propose a novel privacy-preserving collaborative inference mechanism, wherein each edge device in the network secures the privacy of extracted features before transmitting them to a central server for inference. Our approach is designed to achieve two primary objectives: 1) reducing communication overhead and 2) ensuring strict privacy guarantees during feature transmission, while maintaining effective inference performance. Additionally, we introduce an over-the-air pooling scheme specifically designed for classification tasks, which provides formal guarantees on the privacy of transmitted features and establishes a lower bound on classification accuracy.
Submitted 25 October, 2024; originally announced October 2024.
Comments: This work is under review for possible IEEE publication. arXiv admin note: substantial text overlap with arXiv:2406.00256
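
The abstract does not spell out the mechanism; as a generic point of reference only, a textbook Gaussian-mechanism perturbation of a feature vector before transmission looks roughly like the sketch below. The clip bound, epsilon, and delta are illustrative assumptions, and this is not the paper's over-the-air pooling scheme or privacy analysis.

```python
# Generic Gaussian-mechanism sketch of privatizing an extracted feature vector
# before transmission: clip its L2 norm, then add calibrated noise.
import numpy as np

def privatize_features(features: np.ndarray, clip_norm: float,
                       epsilon: float, delta: float,
                       rng: np.random.Generator) -> np.ndarray:
    # Clip so that one device's feature vector has bounded L2 sensitivity.
    norm = np.linalg.norm(features)
    clipped = features * min(1.0, clip_norm / (norm + 1e-12))
    # Standard Gaussian-mechanism noise scale (valid for epsilon <= 1).
    sigma = clip_norm * np.sqrt(2.0 * np.log(1.25 / delta)) / epsilon
    return clipped + rng.normal(0.0, sigma, size=clipped.shape)

rng = np.random.default_rng(0)
feat = rng.normal(size=128)  # extracted feature vector (toy example)
noisy = privatize_features(feat, clip_norm=1.0, epsilon=0.5, delta=1e-5, rng=rng)
```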

arXiv:2410.11096 (https://arxiv.org/abs/2410.11096) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence)
SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI
Authors: Yu Yang, Yuzhou Nie, Zhun Wang, Yuheng Tang, Wenbo Guo, Bo Li, Dawn Song
Abstract: Existing works have established multiple benchmarks to highlight the security risks associated with Code GenAI. These risks are primarily reflected in two areas: a model's potential to generate insecure code (insecure coding) and its utility in cyberattacks (cyberattack helpfulness). While these benchmarks have made significant strides, there remain opportunities for further improvement. For instance, many current benchmarks tend to focus more on a model's ability to provide attack suggestions rather than its capacity to generate executable attacks. Additionally, most benchmarks rely heavily on static evaluation metrics, which may not be as precise as dynamic metrics such as passing test cases. Conversely, expert-verified benchmarks, while offering high-quality data, often operate at a smaller scale. To address these gaps, we develop SecCodePLT, a unified and comprehensive evaluation platform for code GenAIs' risks. For insecure code, we introduce a new methodology for data creation that combines experts with automatic generation. Our methodology ensures the data quality while enabling large-scale generation. We also associate samples with test cases to conduct code-related dynamic evaluation. For cyberattack helpfulness, we set up a real environment and construct samples to prompt a model to generate actual attacks, along with dynamic metrics in our environment. We conduct extensive experiments and show that SecCodePLT outperforms the state-of-the-art (SOTA) benchmark CyberSecEval in security relevance. Furthermore, it better identifies the security risks of SOTA models in insecure coding and cyberattack helpfulness. Finally, we apply SecCodePLT to the SOTA code agent, Cursor, and, for the first time, identify non-trivial security risks in this advanced coding agent.
Submitted 14 October, 2024; originally announced October 2024.
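
For context on what dynamic evaluation via test cases can mean in practice, here is a minimal, hypothetical harness that runs generated code against its associated tests in a subprocess. The file layout, timeout, and lack of sandboxing are simplifications for illustration, not SecCodePLT's pipeline; untrusted code should of course be isolated far more carefully.

```python
# Minimal sketch of dynamic evaluation: execute model-generated code together
# with its associated test cases in a subprocess and record pass/fail.
import subprocess
import sys
import tempfile
from pathlib import Path

def passes_tests(candidate_code: str, test_code: str, timeout_s: float = 5.0) -> bool:
    """Return True if the generated code plus its tests exits cleanly."""
    with tempfile.TemporaryDirectory() as tmp:
        script = Path(tmp) / "candidate_with_tests.py"
        script.write_text(candidate_code + "\n\n" + test_code, encoding="utf-8")
        try:
            result = subprocess.run([sys.executable, str(script)],
                                    capture_output=True, timeout=timeout_s)
        except subprocess.TimeoutExpired:
            return False
        return result.returncode == 0

candidate = "def add(a, b):\n    return a + b\n"
tests = "assert add(1, 2) == 3\nassert add(-1, 1) == 0\n"
print(passes_tests(candidate, tests))  # True
```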

arXiv:2410.08858 (https://arxiv.org/abs/2410.08858) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.SE (Software Engineering)
Decoding Secret Memorization in Code LLMs Through Token-Level Characterization
Authors: Yuqing Nie, Chong Wang, Kailong Wang, Guoai Xu, Guosheng Xu, Haoyu Wang
Abstract: Code Large Language Models (LLMs) have demonstrated remarkable capabilities in generating, understanding, and manipulating programming code. However, their training process inadvertently leads to the memorization of sensitive information, posing severe privacy risks. Existing studies on memorization in LLMs primarily rely on prompt engineering techniques, which suffer from limitations such as widespread hallucination and inefficient extraction of the target sensitive information. In this paper, we present a novel approach to characterize real and fake secrets generated by Code LLMs based on token probabilities. We identify four key characteristics that differentiate genuine secrets from hallucinated ones, providing insights into distinguishing real and fake secrets. To overcome the limitations of existing works, we propose DESEC, a two-stage method that leverages token-level features derived from the identified characteristics to guide the token decoding process. DESEC consists of constructing an offline token scoring model using a proxy Code LLM and employing the scoring model to guide the decoding process by reassigning token likelihoods. Through extensive experiments on four state-of-the-art Code LLMs using a diverse dataset, we demonstrate the superior performance of DESEC in achieving a higher plausible rate and extracting more real secrets compared to existing baselines. Our findings highlight the effectiveness of our token-level approach in enabling an extensive assessment of the privacy leakage risks associated with Code LLMs.
Submitted 11 October, 2024; originally announced October 2024.
Comments: 12 pages, 8 figures
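
A hedged sketch of decoding guided by an external token scorer, in the spirit of "reassigning token likelihoods": the base model's log-probabilities are blended with a separate score before the next token is chosen. The scorer, weighting, and toy vocabulary are assumptions, not DESEC's trained scoring model or feature set.

```python
# Score-guided decoding sketch: add a weighted external token score to the
# base model's log-probabilities before picking each next token.
import math
from typing import Callable, Dict, List

def rescored_decode(prompt: List[str],
                    base_logprobs: Callable[[List[str]], Dict[str, float]],
                    token_score: Callable[[List[str], str], float],
                    alpha: float = 0.5,
                    max_tokens: int = 16) -> List[str]:
    out = list(prompt)
    for _ in range(max_tokens):
        logprobs = base_logprobs(out)
        # Reassign likelihoods: blend in the external score for each candidate.
        adjusted = {tok: lp + alpha * token_score(out, tok) for tok, lp in logprobs.items()}
        nxt = max(adjusted, key=adjusted.get)
        out.append(nxt)
        if nxt == "<eos>":
            break
    return out

# Toy stand-ins so the loop runs.
def base_logprobs(ctx):
    return {"a": math.log(0.5), "b": math.log(0.3), "<eos>": math.log(0.2)}

def token_score(ctx, tok):
    return 1.0 if tok == "<eos>" and len(ctx) > 4 else 0.0

print(rescored_decode(["start"], base_logprobs, token_score, alpha=2.0))
```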

arXiv:2410.07701 (https://arxiv.org/abs/2410.07701) [pdf, other]
Subjects: cs.RO (Robotics)
Autonomous Driving in Unstructured Environments: How Far Have We Come?
Authors: Chen Min, Shubin Si, Xu Wang, Hanzhang Xue, Weizhong Jiang, Yang Liu, Juan Wang, Qingtian Zhu, Qi Zhu, Lun Luo, Fanjie Kong, Jinyu Miao, Xudong Cai, Shuai An, Wei Li, Jilin Mei, Tong Sun, Heng Zhai, Qifeng Liu, Fangzhou Zhao, Liang Chen, Shuai Wang, Erke Shang, Linzhi Shang, Kunlong Zhao, et al. (13 additional authors not shown)
Abstract: Research on autonomous driving in unstructured outdoor environments is less advanced than in structured urban settings due to challenges like environmental diversity and scene complexity. These environments, such as rural areas and rugged terrains, pose unique obstacles that are not common in structured urban areas. Despite these difficulties, autonomous driving in unstructured outdoor environments is crucial for applications in agriculture, mining, and military operations. Our survey reviews over 250 papers on autonomous driving in unstructured outdoor environments, covering offline mapping, pose estimation, environmental perception, path planning, end-to-end autonomous driving, datasets, and relevant challenges. We also discuss emerging trends and future research directions. This review aims to consolidate knowledge and encourage further research for autonomous driving in unstructured environments. To support ongoing work, we maintain an active repository with up-to-date literature and open-source projects at: https://github.com/chaytonmin/Survey-Autonomous-Driving-in-Unstructured-Environments.
Submitted 31 October, 2024; v1 submitted 10 October, 2024; originally announced October 2024.
Comments: Survey paper; 38 pages

arXiv:2410.06886 (https://arxiv.org/abs/2410.06886) [pdf, other]
Subjects: cs.CL (Computation and Language)
FltLM: An Intergrated Long-Context Large Language Model for Effective Context Filtering and Understanding
Authors: Jingyang Deng, Zhengyang Shen, Boyang Wang, Lixin Su, Suqi Cheng, Ying Nie, Junfeng Wang, Dawei Yin, Jinwen Ma
Abstract: The development of Long-Context Large Language Models (LLMs) has markedly advanced natural language processing by facilitating the processing of textual data across long documents and multiple corpora. However, Long-Context LLMs still face two critical challenges: the lost-in-the-middle phenomenon, where crucial middle-context information is likely to be missed, and the distraction issue, where the models lose focus due to overly extended contexts. To address these challenges, we propose the Context Filtering Language Model (FltLM), a novel integrated Long-Context LLM which enhances the ability of the model on multi-document question-answering (QA) tasks. Specifically, FltLM innovatively incorporates a context filter with a soft mask mechanism, identifying and dynamically excluding irrelevant content to concentrate on pertinent information for better comprehension and reasoning. Our approach not only mitigates these two challenges, but also enables the model to operate conveniently in a single forward pass. Experimental results demonstrate that FltLM significantly outperforms supervised fine-tuning and retrieval-based methods in complex QA scenarios, suggesting a promising solution for more accurate and reliable long-context natural language understanding applications.
Submitted 9 October, 2024; originally announced October 2024.
Comments: Accepted by the 27th European Conference on Artificial Intelligence (ECAI-2024); this is the full version of the paper, including technical appendices. This final version features enhanced formatting and corrections to errors present in other online versions. We regret any inconvenience this may have caused our readers.
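
As a rough sketch of what a soft-mask context filter can look like, the snippet below scores context chunks against the question and down-weights low-scoring chunks with a sigmoid mask, so everything still fits in one forward pass. The scoring head and the way the mask is applied are assumptions rather than FltLM's architecture.

```python
# Soft-mask context filtering sketch: score each chunk for relevance and apply
# a soft (0..1) mask to its embeddings before the main model consumes them.
import torch
import torch.nn as nn

class SoftMaskContextFilter(nn.Module):
    def __init__(self, hidden: int):
        super().__init__()
        self.scorer = nn.Linear(hidden, 1)  # relevance score per chunk

    def forward(self, chunk_embs: torch.Tensor, question_emb: torch.Tensor):
        """chunk_embs: (num_chunks, hidden); question_emb: (hidden,)."""
        # Condition the score on the question via a simple elementwise interaction.
        scores = self.scorer(chunk_embs * question_emb).squeeze(-1)  # (num_chunks,)
        mask = torch.sigmoid(scores)                                  # soft 0..1 mask
        filtered = chunk_embs * mask.unsqueeze(-1)                    # down-weight chunks
        return filtered, mask

torch.manual_seed(0)
filt = SoftMaskContextFilter(hidden=64)
chunks, question = torch.randn(10, 64), torch.randn(64)
filtered, mask = filt(chunks, question)
print(mask.shape, filtered.shape)  # torch.Size([10]) torch.Size([10, 64])
```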

arXiv:2409.18569 (https://arxiv.org/abs/2409.18569) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Cross-video Identity Correlating for Person Re-identification Pre-training
Authors: Jialong Zuo, Ying Nie, Hanyu Zhou, Huaxin Zhang, Haoyu Wang, Tianyu Guo, Nong Sang, Changxin Gao
Abstract: Recent research has proven that pre-training on large-scale person images extracted from internet videos is an effective way to learn better representations for person re-identification. However, these studies are mostly confined to pre-training at the instance level or single-video tracklet level. They ignore the identity-invariance in images of the same person across different videos, which is a key focus in person re-identification. To address this issue, we propose a Cross-video Identity-cOrrelating pre-traiNing (CION) framework. Defining a noise concept that comprehensively considers both intra-identity consistency and inter-identity discrimination, CION seeks the identity correlation from cross-video images by modeling it as a progressive multi-level denoising problem. Furthermore, an identity-guided self-distillation loss is proposed to implement better large-scale pre-training by mining the identity-invariance within person images. We conduct extensive experiments to verify the superiority of our CION in terms of efficiency and performance. CION achieves significantly leading performance with even fewer training samples. For example, compared with the previous state-of-the-art [ISR], CION with the same ResNet50-IBN achieves higher mAP of 93.3% and 74.3% on Market1501 and MSMT17, while only utilizing 8% of the training samples. Finally, with CION demonstrating superior model-agnostic ability, we contribute a model zoo named ReIDZoo to meet diverse research and application needs in this field. It contains a series of CION pre-trained models with spanning structures and parameters, totaling 32 models with 10 different structures, including GhostNet, ConvNext, RepViT, FastViT and so on. The code and models will be made publicly available at https://github.com/Zplusdragon/CION_ReIDZoo.
Submitted 27 September, 2024; originally announced September 2024.
Comments: NeurIPS 2024 Accepted Paper
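
A speculative sketch of an identity-guided self-distillation objective of the general kind the abstract mentions: a student is trained to match a teacher's soft predictions on a different image of the same identity, pushing the representation toward identity-invariance. Temperatures and the pairing scheme are assumptions, not CION's exact loss.

```python
# Self-distillation loss sketch: cross-entropy between the teacher's softened
# distribution on one view of an identity and the student's on another view.
import torch
import torch.nn.functional as F

def identity_self_distillation_loss(student_logits: torch.Tensor,
                                    teacher_logits: torch.Tensor,
                                    t_student: float = 0.1,
                                    t_teacher: float = 0.05) -> torch.Tensor:
    """Rows of the two tensors are assumed to correspond to the same identity."""
    teacher_probs = F.softmax(teacher_logits.detach() / t_teacher, dim=-1)
    student_logp = F.log_softmax(student_logits / t_student, dim=-1)
    return -(teacher_probs * student_logp).sum(dim=-1).mean()

torch.manual_seed(0)
student = torch.randn(8, 256, requires_grad=True)  # student outputs, view A
teacher = torch.randn(8, 256)                      # teacher outputs, view B (same identities)
loss = identity_self_distillation_loss(student, teacher)
loss.backward()
print(float(loss))
```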

arXiv:2409.16040 (https://arxiv.org/abs/2409.16040) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Time-MoE: Billion-Scale Time Series Foundation Models with Mixture of Experts
Authors: Xiaoming Shi, Shiyu Wang, Yuqi Nie, Dianqi Li, Zhou Ye, Qingsong Wen, Ming Jin
Abstract: Deep learning for time series forecasting has seen significant advancements over the past decades. However, despite the success of large-scale pre-training in language and vision domains, pre-trained time series models remain limited in scale and operate at a high cost, hindering the development of larger capable forecasting models in real-world applications. In response, we introduce Time-MoE, a scalable and unified architecture designed to pre-train larger, more capable forecasting foundation models while reducing inference costs. By leveraging a sparse mixture-of-experts (MoE) design, Time-MoE enhances computational efficiency by activating only a subset of networks for each prediction, reducing computational load while maintaining high model capacity. This allows Time-MoE to scale effectively without a corresponding increase in inference costs. Time-MoE comprises a family of decoder-only transformer models that operate in an auto-regressive manner and support flexible forecasting horizons with varying input context lengths. We pre-trained these models on our newly introduced large-scale dataset Time-300B, which spans over 9 domains and encompasses over 300 billion time points. For the first time, we scaled a time series foundation model up to 2.4 billion parameters, achieving significantly improved forecasting precision. Our results validate the applicability of scaling laws for training tokens and model size in the context of time series forecasting. Compared to dense models with the same number of activated parameters or equivalent computation budgets, our models consistently outperform them by a large margin. These advancements position Time-MoE as a state-of-the-art solution for tackling real-world time series forecasting challenges with superior capability, efficiency, and flexibility.
Submitted 5 February, 2025; v1 submitted 24 September, 2024; originally announced September 2024.
Comments: Accepted by the 13th International Conference on Learning Representations (ICLR 2025)
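
For readers unfamiliar with sparse mixture-of-experts layers, the general mechanism the abstract relies on can be sketched as a router that activates only the top-k experts per token, so most parameters stay idle on any given prediction. Sizes and routing details below are illustrative, not Time-MoE's configuration.

```python
# Minimal sparse top-k mixture-of-experts feed-forward layer sketch.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SparseMoE(nn.Module):
    def __init__(self, d_model: int, d_ff: int, num_experts: int = 8, k: int = 2):
        super().__init__()
        self.k = k
        self.router = nn.Linear(d_model, num_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """x: (tokens, d_model) -> (tokens, d_model), mixing only the top-k experts per token."""
        gate_logits = self.router(x)                                  # (tokens, num_experts)
        weights, idx = gate_logits.topk(self.k, dim=-1)               # (tokens, k)
        weights = F.softmax(weights, dim=-1)
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e                              # tokens routed to expert e
                if mask.any():
                    out[mask] += weights[mask, slot].unsqueeze(-1) * expert(x[mask])
        return out

moe = SparseMoE(d_model=32, d_ff=64)
y = moe(torch.randn(10, 32))
print(y.shape)  # torch.Size([10, 32])
```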

arXiv:2408.10899 (https://arxiv.org/abs/2408.10899) [pdf, other]
Subjects: cs.RO (Robotics)
All Robots in One: A New Standard and Unified Dataset for Versatile, General-Purpose Embodied Agents
Authors: Zhiqiang Wang, Hao Zheng, Yunshuang Nie, Wenjun Xu, Qingwei Wang, Hua Ye, Zhe Li, Kaidong Zhang, Xuewen Cheng, Wanxi Dong, Chang Cai, Liang Lin, Feng Zheng, Xiaodan Liang
Abstract: Embodied AI is transforming how AI systems interact with the physical world, yet existing datasets are inadequate for developing versatile, general-purpose agents. These limitations include a lack of standardized formats, insufficient data diversity, and inadequate data volume. To address these issues, we introduce ARIO (All Robots In One), a new data standard that enhances existing datasets by offering a unified data format, comprehensive sensory modalities, and a combination of real-world and simulated data. ARIO aims to improve the training of embodied AI agents, increasing their robustness and adaptability across various tasks and environments. Building upon the proposed new standard, we present a large-scale unified ARIO dataset, comprising approximately 3 million episodes collected from 258 series and 321,064 tasks. The ARIO standard and dataset represent a significant step towards bridging the gaps of existing data resources. By providing a cohesive framework for data collection and representation, ARIO paves the way for the development of more powerful and versatile embodied AI agents, capable of navigating and interacting with the physical world in increasingly complex and diverse ways. The project is available at https://imaei.github.io/project_pages/ario/
Submitted 20 August, 2024; originally announced August 2024.
Comments: Project website: https://imaei.github.io/project_pages/ario/

arXiv:2408.10006 (https://arxiv.org/abs/2408.10006) [pdf, other]
Subjects: cs.LG (Machine Learning)
Unlocking the Power of LSTM for Long Term Time Series Forecasting
Authors: Yaxuan Kong, Zepu Wang, Yuqi Nie, Tian Zhou, Stefan Zohren, Yuxuan Liang, Peng Sun, Qingsong Wen
Abstract: Traditional recurrent neural network architectures, such as long short-term memory neural networks (LSTM), have historically held a prominent role in time series forecasting (TSF) tasks. While the recently introduced sLSTM for Natural Language Processing (NLP) introduces exponential gating and memory mixing that are beneficial for long term sequential learning, its potential short memory issue is a barrier to applying sLSTM directly in TSF. To address this, we propose a simple yet efficient algorithm named P-sLSTM, which is built upon sLSTM by incorporating patching and channel independence. These modifications substantially enhance sLSTM's performance in TSF, achieving state-of-the-art results. Furthermore, we provide theoretical justifications for our design, and conduct extensive comparative and analytical experiments to fully validate the efficiency and superior performance of our model.
Submitted 24 February, 2025; v1 submitted 19 August, 2024; originally announced August 2024.
Comments: Accepted by 39th Annual AAAI Conference on Artificial Intelligence (AAAI 2025)
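
The two ingredients named in the abstract, patching and channel independence, can be sketched as a preprocessing step that folds each channel into the batch and slices every univariate series into fixed-length patches that the sequence model consumes as tokens. Patch length and stride here are arbitrary choices, not the paper's hyperparameters.

```python
# Patching + channel-independence preprocessing sketch for multivariate series.
import numpy as np

def channel_independent_patches(series: np.ndarray, patch_len: int, stride: int) -> np.ndarray:
    """series: (batch, length, channels) -> (batch * channels, num_patches, patch_len)."""
    b, length, c = series.shape
    # Channel independence: treat each variable as its own univariate sequence.
    per_channel = series.transpose(0, 2, 1).reshape(b * c, length)
    # Patching: sliding windows of length `patch_len` taken every `stride` steps.
    starts = range(0, length - patch_len + 1, stride)
    patches = np.stack([per_channel[:, s : s + patch_len] for s in starts], axis=1)
    return patches

x = np.random.randn(4, 96, 7)                      # 4 series, 96 steps, 7 variables
tokens = channel_independent_patches(x, patch_len=16, stride=8)
print(tokens.shape)                                # (28, 11, 16)
```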

arXiv:2408.09594 (https://arxiv.org/abs/2408.09594) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Moonshine: Distilling Game Content Generators into Steerable Generative Models
Authors: Yuhe Nie, Michael Middleton, Tim Merino, Nidhushan Kanagaraja, Ashutosh Kumar, Zhan Zhuang, Julian Togelius
Abstract: Procedural Content Generation via Machine Learning (PCGML) has enhanced game content creation, yet challenges in controllability and limited training data persist. This study addresses these issues by distilling a constructive PCG algorithm into a controllable PCGML model. We first generate a large amount of content with a constructive algorithm and label it using a Large Language Model (LLM). We use these synthetic labels to condition two PCGML models for content-specific generation, a diffusion model and the five-dollar model. This neural network distillation process ensures that the generation aligns with the original algorithm while introducing controllability through plain text. We define this text-conditioned PCGML as a Text-to-game-Map (T2M) task, offering an alternative to prevalent text-to-image multi-modal tasks. We compare our distilled models with the baseline constructive algorithm. Our analysis of the variety, accuracy, and quality of our generation demonstrates the efficacy of distilling constructive methods into controllable text-conditioned PCGML models.
Submitted 2 February, 2025; v1 submitted 18 August, 2024; originally announced August 2024.
ACM Class: I.2.1
In this project, we developed a cycle-Generative Adversarial Network (cycle-GAN) method for unmixing the triplex images generated from the above-mentioned assays. Three different models were designed to generate the singleplex image for each of the three stains Tamra (purple), QM-Dabsyl (yellow) and Green. A notable novelty of our approach was that the input to the network were images in the optical density domain instead of conventionally used RGB images. The use of the optical density domain helped in reducing the blurriness of the synthetic singleplex images, which was often observed when the network was trained on RGB images. The cycle-GAN models were validated on 10,800 lung, gastric and colon images for the cMET-PDL1-EGFR assay and 3600 colon images for the CD8-LAG3-PDL1 assay. Visual as well as quantified assessments demonstrated that the proposed method is effective and efficient when compared with the manual reviewing results and is readily applicable to various multiplex assays. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07860v1-abstract-full').style.display = 'none'; document.getElementById('2408.07860v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09553">arXiv:2407.09553</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09553">pdf</a>, <a href="https://arxiv.org/format/2407.09553">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DPEC: Dual-Path Error Compensation Method for Enhanced Low-Light Image Clarity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Q">Qianwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+B">Boxing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yihe Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+Q">Qingchuan Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09553v4-abstract-short" style="display: inline;"> For the task of low-light image enhancement, deep learning-based algorithms have demonstrated superiority and effectiveness compared to traditional methods. However, these methods, primarily based on Retinex theory, tend to overlook the noise and color distortions in input images, leading to significant noise amplification and local color distortions in enhanced results. 
To address these issues, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09553v4-abstract-full').style.display = 'inline'; document.getElementById('2407.09553v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09553v4-abstract-full" style="display: none;"> For the task of low-light image enhancement, deep learning-based algorithms have demonstrated superiority and effectiveness compared to traditional methods. However, these methods, primarily based on Retinex theory, tend to overlook the noise and color distortions in input images, leading to significant noise amplification and local color distortions in enhanced results. To address these issues, we propose the Dual-Path Error Compensation (DPEC) method, designed to improve image quality under low-light conditions by preserving local texture details while restoring global image brightness without amplifying noise. DPEC incorporates precise pixel-level error estimation to capture subtle differences and an independent denoising mechanism to prevent noise amplification. We introduce the HIS-Retinex loss to guide DPEC&#39;s training, ensuring the brightness distribution of enhanced images closely aligns with real-world conditions. To balance computational speed and resource efficiency while training DPEC for a comprehensive understanding of the global context, we integrated the VMamba architecture into its backbone. Comprehensive quantitative and qualitative experimental results demonstrate that our algorithm significantly outperforms state-of-the-art methods in low-light image enhancement. The code is publicly available online at https://github.com/wangshuang233/DPEC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09553v4-abstract-full').style.display = 'none'; document.getElementById('2407.09553v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05840">arXiv:2407.05840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05840">pdf</a>, <a href="https://arxiv.org/format/2407.05840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> A 103-TOPS/mm$^2$ Integrated Photonic Computing Engine Enabling Next-Generation Reservoir Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dongliang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yikun Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Gaolei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+H+K">Hon Ki Tsang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chaoran Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05840v1-abstract-short" style="display: inline;"> Reservoir computing (RC) is a leading machine learning algorithm for information processing due to its rich expressiveness. A new RC paradigm has recently emerged, showcasing superior performance and delivering more interpretable results with shorter training data sets and training times, representing the next generation of RC computing. This work presents the first realization of a high-speed nex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05840v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05840v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05840v1-abstract-full" style="display: none;"> Reservoir computing (RC) is a leading machine learning algorithm for information processing due to its rich expressiveness. A new RC paradigm has recently emerged, showcasing superior performance and delivering more interpretable results with shorter training data sets and training times, representing the next generation of RC computing. This work presents the first realization of a high-speed next-generation RC system on an integrated photonic chip. Our experimental results demonstrate state-of-the-art forecasting and classification performances under various machine learning tasks and achieve the fastest speeds of 60 Gbaud and a computing density of 103 tera operations/second/mm$^2$ (TOPS/mm$^2$). The passive system, composed of a simple star coupler with on-chip delay lines, offers several advantages over traditional RC systems, including no speed limitations, compact footprint, extremely high fabrication error tolerance, fewer metaparameters, and greater interpretability. This work lays the foundation for ultrafast on-chip photonic RC, representing significant progress toward developing next-generation high-speed photonic computing and signal processing. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05840v1-abstract-full').style.display = 'none'; document.getElementById('2407.05840v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04277">arXiv:2407.04277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04277">pdf</a>, <a href="https://arxiv.org/format/2407.04277">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Research, Applications and Prospects of Event-Based Pedestrian Detection: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuman Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongjie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Min Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+W">Wen Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaoxiong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04277v1-abstract-short" style="display: inline;"> Event-based cameras, inspired by the biological retina, have evolved into cutting-edge sensors distinguished by their minimal power requirements, negligible latency, superior temporal resolution, and expansive dynamic range. At present, cameras used for pedestrian detection are mainly frame-based imaging sensors, which have suffered from lethargic response times and hefty data redundancy. In contr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04277v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04277v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04277v1-abstract-full" style="display: none;"> Event-based cameras, inspired by the biological retina, have evolved into cutting-edge sensors distinguished by their minimal power requirements, negligible latency, superior temporal resolution, and expansive dynamic range. At present, cameras used for pedestrian detection are mainly frame-based imaging sensors, which have suffered from lethargic response times and hefty data redundancy. In contrast, event-based cameras address these limitations by eschewing extraneous data transmissions and obviating motion blur in high-speed imaging scenarios. On pedestrian detection via event-based cameras, this paper offers an exhaustive review of research and applications particularly in the autonomous driving context. 
Through methodically scrutinizing relevant literature, the paper outlines the foundational principles, developmental trajectory, and the comparative merits and demerits of event-based detection relative to traditional frame-based methodologies. This review conducts thorough analyses of various event stream inputs and their corresponding network models to evaluate their applicability across diverse operational environments. It also delves into pivotal elements such as crucial datasets and data acquisition techniques essential for advancing this technology, as well as advanced algorithms for processing event stream data. Culminating with a synthesis of the extant landscape, the review accentuates the unique advantages and persistent challenges inherent in event-based pedestrian detection, offering a prognostic view on potential future developments in this fast-progressing field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04277v1-abstract-full').style.display = 'none'; document.getElementById('2407.04277v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02301">arXiv:2407.02301</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.02301">pdf</a>, <a href="https://arxiv.org/format/2407.02301">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CFinBench: A Comprehensive Chinese Financial Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Ying Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+B">Binwei Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+T">Tianyu Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Binfan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weihao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Weijian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yunhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02301v1-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved remarkable performance on various NLP tasks, yet their potential in more challenging and domain-specific task, such as finance, has not been fully explored. In this paper, we present CFinBench: a meticulously crafted, the most comprehensive evaluation benchmark to date, for assessing the financial knowledge of LLMs under Chinese context.
In practice, to b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02301v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02301v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02301v1-abstract-full" style="display: none;"> Large language models (LLMs) have achieved remarkable performance on various NLP tasks, yet their potential in more challenging and domain-specific task, such as finance, has not been fully explored. In this paper, we present CFinBench: a meticulously crafted, the most comprehensive evaluation benchmark to date, for assessing the financial knowledge of LLMs under Chinese context. In practice, to better align with the career trajectory of Chinese financial practitioners, we build a systematic evaluation from 4 first-level categories: (1) Financial Subject: whether LLMs can memorize the necessary basic knowledge of financial subjects, such as economics, statistics and auditing. (2) Financial Qualification: whether LLMs can obtain the needed financial qualified certifications, such as certified public accountant, securities qualification and banking qualification. (3) Financial Practice: whether LLMs can fulfill the practical financial jobs, such as tax consultant, junior accountant and securities analyst. (4) Financial Law: whether LLMs can meet the requirement of financial laws and regulations, such as tax law, insurance law and economic law. CFinBench comprises 99,100 questions spanning 43 second-level categories with 3 question types: single-choice, multiple-choice and judgment. We conduct extensive experiments of 50 representative LLMs with various model size on CFinBench. The results show that GPT4 and some Chinese-oriented models lead the benchmark, with the highest average accuracy being 60.16%, highlighting the challenge presented by CFinBench. The dataset and evaluation code are available at https://cfinbench.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02301v1-abstract-full').style.display = 'none'; document.getElementById('2407.02301v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11903">arXiv:2406.11903</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11903">pdf</a>, <a href="https://arxiv.org/format/2406.11903">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Finance">q-fin.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Large Language Models for Financial Applications: Progress, Prospects and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuqi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+Y">Yaxuan Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+X">Xiaowen Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Mulvey%2C+J+M">John M. Mulvey</a>, <a href="/search/cs?searchtype=author&amp;query=Poor%2C+H+V">H. Vincent Poor</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zohren%2C+S">Stefan Zohren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11903v1-abstract-short" style="display: inline;"> Recent advances in large language models (LLMs) have unlocked novel opportunities for machine learning applications in the financial domain. These models have demonstrated remarkable capabilities in understanding context, processing vast amounts of data, and generating human-preferred contents. In this survey, we explore the application of LLMs on various financial tasks, focusing on their potenti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11903v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11903v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11903v1-abstract-full" style="display: none;"> Recent advances in large language models (LLMs) have unlocked novel opportunities for machine learning applications in the financial domain. These models have demonstrated remarkable capabilities in understanding context, processing vast amounts of data, and generating human-preferred contents. In this survey, we explore the application of LLMs on various financial tasks, focusing on their potential to transform traditional practices and drive innovation. We provide a discussion of the progress and advantages of LLMs in financial contexts, analyzing their advanced technologies as well as prospective capabilities in contextual understanding, transfer learning flexibility, complex emotion detection, etc. We then highlight this survey for categorizing the existing literature into key application areas, including linguistic tasks, sentiment analysis, financial time series, financial reasoning, agent-based modeling, and other applications. For each application area, we delve into specific methodologies, such as textual analysis, knowledge-based analysis, forecasting, data augmentation, planning, decision support, and simulations. 
Furthermore, a comprehensive collection of datasets, model assets, and useful codes associated with mainstream applications are presented as resources for the researchers and practitioners. Finally, we outline the challenges and opportunities for future research, particularly emphasizing a number of distinctive aspects in this field. We hope our work can help facilitate the adoption and further development of LLMs in the financial sector. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11903v1-abstract-full').style.display = 'none'; document.getElementById('2406.11903v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08725">arXiv:2406.08725</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08725">pdf</a>, <a href="https://arxiv.org/format/2406.08725">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RL-JACK: Reinforcement Learning-powered Black-box Jailbreaking Attack against LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuzhou Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+L">Lu Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yunshu Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+W">Wenbo Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiangyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08725v1-abstract-short" style="display: inline;"> Modern large language model (LLM) developers typically conduct a safety alignment to prevent an LLM from generating unethical or harmful content. Recent studies have discovered that the safety alignment of LLMs can be bypassed by jailbreaking prompts. These prompts are designed to create specific conversation scenarios with a harmful question embedded. Querying an LLM with such prompts can mislead&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08725v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08725v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08725v1-abstract-full" style="display: none;"> Modern large language model (LLM) developers typically conduct a safety alignment to prevent an LLM from generating unethical or harmful content. Recent studies have discovered that the safety alignment of LLMs can be bypassed by jailbreaking prompts. These prompts are designed to create specific conversation scenarios with a harmful question embedded. Querying an LLM with such prompts can mislead the model into responding to the harmful question. 
The stochastic and random nature of existing genetic methods largely limits the effectiveness and efficiency of state-of-the-art (SOTA) jailbreaking attacks. In this paper, we propose RL-JACK, a novel black-box jailbreaking attack powered by deep reinforcement learning (DRL). We formulate the generation of jailbreaking prompts as a search problem and design a novel RL approach to solve it. Our method includes a series of customized designs to enhance the RL agent&#39;s learning efficiency in the jailbreaking context. Notably, we devise an LLM-facilitated action space that enables diverse action variations while constraining the overall search space. We propose a novel reward function that provides meaningful dense rewards for the agent toward achieving successful jailbreaking. Through extensive evaluations, we demonstrate that RL-JACK is overall much more effective than existing jailbreaking attacks against six SOTA LLMs, including large open-source models and commercial models. We also show the RL-JACK&#39;s resiliency against three SOTA defenses and its transferability across different models. Finally, we validate the insensitivity of RL-JACK to the variations in key hyper-parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08725v1-abstract-full').style.display = 'none'; document.getElementById('2406.08725v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08705">arXiv:2406.08705</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08705">pdf</a>, <a href="https://arxiv.org/format/2406.08705">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> When LLM Meets DRL: Advancing Jailbreaking Efficiency via DRL-guided Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuzhou Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+W">Wenbo Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiangyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08705v4-abstract-short" style="display: inline;"> Recent studies developed jailbreaking attacks, which construct jailbreaking prompts to fool LLMs into responding to harmful questions. Early-stage jailbreaking attacks require access to model internals or significant human efforts. More advanced attacks utilize genetic algorithms for automatic and black-box attacks. 
However, the random nature of genetic algorithms significantly limits the effectiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08705v4-abstract-full').style.display = 'inline'; document.getElementById('2406.08705v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08705v4-abstract-full" style="display: none;"> Recent studies developed jailbreaking attacks, which construct jailbreaking prompts to fool LLMs into responding to harmful questions. Early-stage jailbreaking attacks require access to model internals or significant human efforts. More advanced attacks utilize genetic algorithms for automatic and black-box attacks. However, the random nature of genetic algorithms significantly limits the effectiveness of these attacks. In this paper, we propose RLbreaker, a black-box jailbreaking attack driven by deep reinforcement learning (DRL). We model jailbreaking as a search problem and design an RL agent to guide the search, which is more effective and has less randomness than stochastic search, such as genetic algorithms. Specifically, we design a customized DRL system for the jailbreaking problem, including a novel reward function and a customized proximal policy optimization (PPO) algorithm. Through extensive experiments, we demonstrate that RLbreaker is much more effective than existing jailbreaking attacks against six state-of-the-art (SOTA) LLMs. We also show that RLbreaker is robust against three SOTA defenses and its trained agents can transfer across different LLMs. We further validate the key design choices of RLbreaker via a comprehensive ablation study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08705v4-abstract-full').style.display = 'none'; document.getElementById('2406.08705v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00256">arXiv:2406.00256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00256">pdf</a>, <a href="https://arxiv.org/format/2406.00256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Over-the-Air Collaborative Inference with Feature Differential Privacy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seif%2C+M">Mohamed Seif</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuqi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Goldsmith%2C+A">Andrea Goldsmith</a>, <a href="/search/cs?searchtype=author&amp;query=Poor%2C+V">Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00256v1-abstract-short" style="display: inline;"> Collaborative inference in next-generation networks can enhance Artificial Intelligence (AI) applications, including autonomous driving, personal identification, and activity classification. This method involves a three-stage process: a) data acquisition through sensing, b) feature extraction, and c) feature encoding for transmission. Transmission of the extracted features entails the potential ri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00256v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00256v1-abstract-full" style="display: none;"> Collaborative inference in next-generation networks can enhance Artificial Intelligence (AI) applications, including autonomous driving, personal identification, and activity classification. This method involves a three-stage process: a) data acquisition through sensing, b) feature extraction, and c) feature encoding for transmission. Transmission of the extracted features entails the potential risk of exposing sensitive personal data. To address this issue, in this work a new privacy-protecting collaborative inference mechanism is developed. Under this mechanism, each edge device in the network protects the privacy of extracted features before transmitting them to a central server for inference. This mechanism aims to achieve two main objectives while ensuring effective inference performance: 1) reducing communication overhead, and 2) maintaining strict privacy guarantees during features transmission. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00256v1-abstract-full').style.display = 'none'; document.getElementById('2406.00256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18721">arXiv:2405.18721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18721">pdf</a>, <a href="https://arxiv.org/format/2405.18721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TPAMI.2024.3407759">10.1109/TPAMI.2024.3407759 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Correctable Landmark Discovery via Large Models for Vision-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B">Bingqian Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yunshuang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Ziming Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+S">Shikui Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jianzhuang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18721v2-abstract-short" style="display: inline;"> Vision-Language Navigation (VLN) requires the agent to follow language instructions to reach a target position. A key factor for successful navigation is to align the landmarks implied in the instruction with diverse visual observations. However, previous VLN agents fail to perform accurate modality alignment especially in unexplored scenes, since they learn from limited navigation data and lack s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18721v2-abstract-full').style.display = 'inline'; document.getElementById('2405.18721v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18721v2-abstract-full" style="display: none;"> Vision-Language Navigation (VLN) requires the agent to follow language instructions to reach a target position. A key factor for successful navigation is to align the landmarks implied in the instruction with diverse visual observations. However, previous VLN agents fail to perform accurate modality alignment especially in unexplored scenes, since they learn from limited navigation data and lack sufficient open-world alignment knowledge. In this work, we propose a new VLN paradigm, called COrrectable LaNdmark DiScOvery via Large ModEls (CONSOLE). In CONSOLE, we cast VLN as an open-world sequential landmark discovery problem, by introducing a novel correctable landmark discovery scheme based on two large models ChatGPT and CLIP. 
Specifically, we use ChatGPT to provide rich open-world landmark cooccurrence commonsense, and conduct CLIP-driven landmark discovery based on these commonsense priors. To mitigate the noise in the priors due to the lack of visual constraints, we introduce a learnable cooccurrence scoring module, which corrects the importance of each cooccurrence according to actual observations for accurate landmark discovery. We further design an observation enhancement strategy for an elegant combination of our framework with different VLN agents, where we utilize the corrected landmark features to obtain enhanced observation features for action decision. Extensive experimental results on multiple popular VLN benchmarks (R2R, REVERIE, R4R, RxR) show the significant superiority of CONSOLE over strong baselines. Especially, our CONSOLE establishes the new state-of-the-art results on R2R and R4R in unseen scenarios. Code is available at https://github.com/expectorlin/CONSOLE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18721v2-abstract-full').style.display = 'none'; document.getElementById('2405.18721v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TPAMI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16783">arXiv:2405.16783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16783">pdf</a>, <a href="https://arxiv.org/format/2405.16783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TrojFM: Resource-efficient Backdoor Attacks against Very Large Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuzhou. Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanting. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+J">Jinyuan. Jia</a>, <a href="/search/cs?searchtype=author&amp;query=De+Lucia%2C+M+J">Michael J. De Lucia</a>, <a href="/search/cs?searchtype=author&amp;query=Bastian%2C+N+D">Nathaniel D. Bastian</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+W">Wenbo. Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+D">Dawn. Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16783v1-abstract-short" style="display: inline;"> One key challenge in backdoor attacks against large foundation models is the resource limits. Backdoor attacks usually require retraining the target model, which is impractical for very large foundation models. 
Existing backdoor attacks are mainly designed for supervised classifiers or small foundation models (e.g., BERT). None of these attacks has successfully compromised a very large foundation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16783v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16783v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16783v1-abstract-full" style="display: none;"> One key challenge in backdoor attacks against large foundation models is the resource limits. Backdoor attacks usually require retraining the target model, which is impractical for very large foundation models. Existing backdoor attacks are mainly designed for supervised classifiers or small foundation models (e.g., BERT). None of these attacks has successfully compromised a very large foundation model, such as Llama-3-70B, especially with limited computational resources. In this paper, we propose TrojFM, a novel backdoor attack tailored for very large foundation models. Our primary technical contribution is the development of a novel backdoor injection method. This method forces a backdoored model to generate similar hidden representations for poisoned inputs regardless of their actual semantics. Our approach injects such backdoors by fine-tuning only a very small proportion of model parameters. This enables TrojFM to efficiently launch downstream task-agnostic backdoor attacks against very large foundation models under limited computational resources. Moreover, we optimize the fine-tuning process with our customized QLoRA technique, enabling us to launch our attack with only one A100 GPU. Furthermore, we design a new trigger injection method to ensure our attack&#39;s stealthiness. Through extensive experiments, we first demonstrate that TrojFM can launch effective backdoor attacks against widely used large GPT-style models without jeopardizing their normal functionalities (and outperforming existing attacks on BERT-style models). Furthermore, we show that TrojFM is resilient to SOTA defenses and is insensitive to changes in key hyper-parameters. Finally, we conduct a resource analysis to quantify that our method can significantly save computational and memory costs compared to existing backdoor attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16783v1-abstract-full').style.display = 'none'; document.getElementById('2405.16783v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13581">arXiv:2405.13581</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.13581">pdf</a>, <a href="https://arxiv.org/format/2405.13581">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Safety Alignment for Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhendong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuanbi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Y">Yingshui Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+X">Xiangyu Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Q">Qiushi Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chongjun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiaoyong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13581v1-abstract-short" style="display: inline;"> Benefiting from the powerful capabilities of Large Language Models (LLMs), pre-trained visual encoder models connected to an LLM can realize Vision Language Models (VLMs). However, existing research shows that the visual modality of VLMs is vulnerable, with attackers easily bypassing LLMs&#39; safety alignment through visual modality features to launch attacks. To address this issue, we enhance the e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13581v1-abstract-full').style.display = 'inline'; document.getElementById('2405.13581v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13581v1-abstract-full" style="display: none;"> Benefiting from the powerful capabilities of Large Language Models (LLMs), pre-trained visual encoder models connected to an LLM can realize Vision Language Models (VLMs). However, existing research shows that the visual modality of VLMs is vulnerable, with attackers easily bypassing LLMs&#39; safety alignment through visual modality features to launch attacks. To address this issue, we enhance the existing VLMs&#39; visual modality safety alignment by adding safety modules, including a safety projector, safety tokens, and a safety head, through a two-stage training process, effectively improving the model&#39;s defense against risky images. For example, building upon the LLaVA-v1.5 model, we achieve a safety score of 8.26, surpassing GPT-4V on the Red Teaming Visual Language Models (RTVLM) benchmark. Our method boasts ease of use, high flexibility, and strong controllability, and it enhances safety while having minimal impact on the model&#39;s general performance. Moreover, our alignment strategy also uncovers some possible risky content within commonly used open-source multimodal datasets. Our code will be open sourced after the anonymous review. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13581v1-abstract-full').style.display = 'none'; document.getElementById('2405.13581v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04390">arXiv:2405.04390</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.04390">pdf</a>, <a href="https://arxiv.org/format/2405.04390">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DriveWorld: 4D Pre-trained Scene Understanding via World Models for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Min%2C+C">Chen Min</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+D">Dawei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+L">Liang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xinli Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+L">Lei Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianshu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yulan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+J">Junliang Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+L">Liping Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yiming Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+B">Bin Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04390v1-abstract-short" style="display: inline;"> Vision-centric autonomous driving has recently raised wide attention due to its lower cost. Pre-training is essential for extracting a universal representation. However, current vision-centric pre-training typically relies on either 2D or 3D pre-text tasks, overlooking the temporal characteristics of autonomous driving as a 4D scene understanding task. In this paper, we address this challenge by i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04390v1-abstract-full').style.display = 'inline'; document.getElementById('2405.04390v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04390v1-abstract-full" style="display: none;"> Vision-centric autonomous driving has recently raised wide attention due to its lower cost. Pre-training is essential for extracting a universal representation. 
However, current vision-centric pre-training typically relies on either 2D or 3D pre-text tasks, overlooking the temporal characteristics of autonomous driving as a 4D scene understanding task. In this paper, we address this challenge by introducing a world model-based autonomous driving 4D representation learning framework, dubbed \emph{DriveWorld}, which is capable of pre-training from multi-camera driving videos in a spatio-temporal fashion. Specifically, we propose a Memory State-Space Model for spatio-temporal modelling, which consists of a Dynamic Memory Bank module for learning temporal-aware latent dynamics to predict future changes and a Static Scene Propagation module for learning spatial-aware latent statics to offer comprehensive scene contexts. We additionally introduce a Task Prompt to decouple task-aware features for various downstream tasks. The experiments demonstrate that DriveWorld delivers promising results on various autonomous driving tasks. When pre-trained with the OpenScene dataset, DriveWorld achieves a 7.5% increase in mAP for 3D object detection, a 3.0% increase in IoU for online mapping, a 5.0% increase in AMOTA for multi-object tracking, a 0.1m decrease in minADE for motion forecasting, a 3.0% increase in IoU for occupancy prediction, and a 0.34m reduction in average L2 error for planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04390v1-abstract-full').style.display = 'none'; document.getElementById('2405.04390v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02357">arXiv:2405.02357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02357">pdf</a>, <a href="https://arxiv.org/format/2405.02357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Mobility Analysis in Transportation Systems: A Survey on Forecasting Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zijian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yujie Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zepu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuqi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaobo Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Ruolin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+P">Peng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ban%2C+X">Xuegang Ban</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02357v2-abstract-short" style="display: inline;"> Mobility analysis is a crucial element in the research area of transportation systems. 
Forecasting traffic information offers a viable solution to address the conflict between increasing transportation demands and the limitations of transportation infrastructure. Predicting human travel is significant in aiding various transportation and urban management tasks, such as taxi dispatch and urban plan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02357v2-abstract-full').style.display = 'inline'; document.getElementById('2405.02357v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02357v2-abstract-full" style="display: none;"> Mobility analysis is a crucial element in the research area of transportation systems. Forecasting traffic information offers a viable solution to address the conflict between increasing transportation demands and the limitations of transportation infrastructure. Predicting human travel is significant in aiding various transportation and urban management tasks, such as taxi dispatch and urban planning. Machine learning and deep learning methods are favored for their flexibility and accuracy. Nowadays, with the advent of large language models (LLMs), many researchers have combined these models with previous techniques or applied LLMs to directly predict future traffic information and human travel behaviors. However, there is a lack of comprehensive studies on how LLMs can contribute to this field. This survey explores existing approaches using LLMs for time series forecasting problems for mobility in transportation systems. We provide a literature review concerning the forecasting applications within transportation systems, elucidating how researchers utilize LLMs, showcasing recent state-of-the-art advancements, and identifying the challenges that must be overcome to fully leverage LLMs in this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02357v2-abstract-full').style.display = 'none'; document.getElementById('2405.02357v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, presented in 2025 TRB meeting</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15538">arXiv:2404.15538</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15538">pdf</a>, <a href="https://arxiv.org/format/2404.15538">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DreamCraft: Text-Guided Generation of Functional 3D Environments in Minecraft </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Earle%2C+S">Sam Earle</a>, <a href="/search/cs?searchtype=author&amp;query=Kokkinos%2C+F">Filippos Kokkinos</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuhe Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Togelius%2C+J">Julian Togelius</a>, <a href="/search/cs?searchtype=author&amp;query=Raileanu%2C+R">Roberta Raileanu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15538v1-abstract-short" style="display: inline;"> Procedural Content Generation (PCG) algorithms enable the automatic generation of complex and diverse artifacts. However, they don&#39;t provide high-level control over the generated content and typically require domain expertise. In contrast, text-to-3D methods allow users to specify desired characteristics in natural language, offering a high amount of flexibility and expressivity. But unlike PCG, s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15538v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15538v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15538v1-abstract-full" style="display: none;"> Procedural Content Generation (PCG) algorithms enable the automatic generation of complex and diverse artifacts. However, they don&#39;t provide high-level control over the generated content and typically require domain expertise. In contrast, text-to-3D methods allow users to specify desired characteristics in natural language, offering a high amount of flexibility and expressivity. But unlike PCG, such approaches cannot guarantee functionality, which is crucial for certain applications like game design. In this paper, we present a method for generating functional 3D artifacts from free-form text prompts in the open-world game Minecraft. Our method, DreamCraft, trains quantized Neural Radiance Fields (NeRFs) to represent artifacts that, when viewed in-game, match given text descriptions. We find that DreamCraft produces more aligned in-game artifacts than a baseline that post-processes the output of an unconstrained NeRF. 
Thanks to the quantized representation of the environment, functional constraints can be integrated using specialized loss terms. We show how this can be leveraged to generate 3D structures that match a target distribution or obey certain adjacency rules over the block types. DreamCraft inherits a high degree of expressivity and controllability from the NeRF, while still being able to incorporate functional constraints through domain-specific objectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15538v1-abstract-full').style.display = 'none'; document.getElementById('2404.15538v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 9 figures, accepted to Foundation of Digital Games 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15127">arXiv:2404.15127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15127">pdf</a>, <a href="https://arxiv.org/format/2404.15127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> GSCo: Towards Generalizable AI in Medicine via Generalist-Specialist Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Sunan He</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongmei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yihui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zhiyuan Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhixuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yingxue Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+H">Huiling Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mingxiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Shih%2C+G">George Shih</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Ziyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Vardhanabhuti%2C+V">Varut Vardhanabhuti</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+W+C+W">Winnie Chiu Wing Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yefeng Zheng</a>, <a 
href="/search/cs?searchtype=author&amp;query=Rajpurkar%2C+P">Pranav Rajpurkar</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15127v2-abstract-short" style="display: inline;"> Generalist foundation models (GFMs) are renowned for their exceptional capability and flexibility in effectively generalizing across diverse tasks and modalities. In the field of medicine, while GFMs exhibit superior generalizability based on their extensive intrinsic knowledge as well as proficiency in instruction following and in-context learning, specialist models excel in precision due to thei&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15127v2-abstract-full').style.display = 'inline'; document.getElementById('2404.15127v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15127v2-abstract-full" style="display: none;"> Generalist foundation models (GFMs) are renowned for their exceptional capability and flexibility in effectively generalizing across diverse tasks and modalities. In the field of medicine, while GFMs exhibit superior generalizability based on their extensive intrinsic knowledge as well as proficiency in instruction following and in-context learning, specialist models excel in precision due to their domain knowledge. In this work, for the first time, we explore the synergy between the GFM and specialist models, to enable precise medical image analysis on a broader scope. Specifically, we propose a cooperative framework, Generalist-Specialist Collaboration (GSCo), which consists of two stages, namely the construction of GFM and specialists, and collaborative inference on downstream tasks. In the construction stage, we develop MedDr, the largest open-source GFM tailored for medicine, showcasing exceptional instruction-following and in-context learning capabilities. Meanwhile, a series of lightweight specialists are crafted for downstream tasks with low computational cost. In the collaborative inference stage, we introduce two cooperative mechanisms, Mixture-of-Expert Diagnosis and Retrieval-Augmented Diagnosis, to harvest the generalist&#39;s in-context learning abilities alongside the specialists&#39; domain expertise. For a comprehensive evaluation, we curate a large-scale benchmark featuring 28 datasets and about 250,000 images. Extensive results demonstrate that MedDr consistently outperforms state-of-the-art GFMs on downstream datasets. Furthermore, GSCo exceeds both GFMs and specialists across all out-of-domain disease diagnosis datasets. These findings indicate a significant paradigm shift in the application of GFMs, transitioning from separate models for specific tasks to a collaborative approach between GFMs and specialists, thereby advancing the frontiers of generalizable AI in medicine. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15127v2-abstract-full').style.display = 'none'; document.getElementById('2404.15127v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03264">arXiv:2404.03264</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03264">pdf</a>, <a href="https://arxiv.org/format/2404.03264">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Foundation Model for Advancing Healthcare: Challenges, Opportunities, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuting He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fuxiang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xinrui Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Minghao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03264v1-abstract-short" style="display: inline;"> Foundation model, which is pre-trained on broad data and is able to adapt to a wide range of tasks, is advancing healthcare. It promotes the development of healthcare artificial intelligence (AI) models, breaking the contradiction between limited AI models and diverse healthcare practices. Much more widespread healthcare scenarios will benefit from the development of a healthcare foundation model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03264v1-abstract-full').style.display = 'inline'; document.getElementById('2404.03264v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03264v1-abstract-full" style="display: none;"> Foundation model, which is pre-trained on broad data and is able to adapt to a wide range of tasks, is advancing healthcare. It promotes the development of healthcare artificial intelligence (AI) models, breaking the contradiction between limited AI models and diverse healthcare practices. Much more widespread healthcare scenarios will benefit from the development of a healthcare foundation model (HFM), improving their advanced intelligent healthcare services. Despite the impending widespread deployment of HFMs, there is currently a lack of clear understanding about how they work in the healthcare field, their current challenges, and where they are headed in the future. 
To answer these questions, a comprehensive and deep survey of the challenges, opportunities, and future directions of HFMs is presented in this survey. It first conducted a comprehensive overview of the HFM including the methods, data, and applications for a quick grasp of the current progress. Then, it made an in-depth exploration of the challenges present in data, algorithms, and computing infrastructures for constructing and widespread application of foundation models in healthcare. This survey also identifies emerging and promising directions in this field for future development. We believe that this survey will enhance the community&#39;s comprehension of the current progress of HFM and serve as a valuable source of guidance for future development in this field. The latest HFM papers and related resources are maintained on our website: https://github.com/YutingHe-list/Awesome-Foundation-Models-for-Advancing-Healthcare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03264v1-abstract-full').style.display = 'none'; document.getElementById('2404.03264v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19632">arXiv:2403.19632</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.19632">pdf</a>, <a href="https://arxiv.org/format/2403.19632">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GauStudio: A Modular Framework for 3D Gaussian Splatting and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+C">Chongjie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yinyu Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J">Jiahao Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuantao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhi%2C+Y">Yihao Zhi</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaoguang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19632v1-abstract-short" style="display: inline;"> We present GauStudio, a novel modular framework for modeling 3D Gaussian Splatting (3DGS) to provide standardized, plug-and-play components for users to easily customize and implement a 3DGS pipeline. Supported by our framework, we propose a hybrid Gaussian representation with foreground and skyball background models. 
Abstract: We present GauStudio, a novel modular framework for modeling 3D Gaussian Splatting (3DGS) to provide standardized, plug-and-play components for users to easily customize and implement a 3DGS pipeline. Supported by our framework, we propose a hybrid Gaussian representation with foreground and skyball background models. Experiments demonstrate this representation reduces artifacts in unbounded outdoor scenes and improves novel view synthesis. Finally, we propose Gaussian Splatting Surface Reconstruction (GauS), a novel render-then-fuse approach for high-fidelity mesh reconstruction from 3DGS inputs without fine-tuning. Overall, our GauStudio framework, hybrid representation, and GauS approach enhance 3DGS modeling and rendering capabilities, enabling higher-quality novel view synthesis and surface reconstruction.
Submitted 28 March, 2024; originally announced March 2024.
Comments: Code: https://github.com/GAP-LAB-CUHK-SZ/gaustudio

arXiv:2403.19319 [pdf, other] cs.CV
Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field Representation and Generation
Authors: Yujin Chen, Yinyu Nie, Benjamin Ummenhofer, Reiner Birkl, Michael Paulitsch, Matthias Müller, Matthias Nießner
Abstract: We present Mesh2NeRF, an approach to derive ground-truth radiance fields from textured meshes for 3D generation tasks. Many 3D generative approaches represent 3D scenes as radiance fields for training. Their ground-truth radiance fields are usually fitted from multi-view renderings from a large-scale synthetic 3D dataset, which often results in artifacts due to occlusions or under-fitting issues. In Mesh2NeRF, we propose an analytic solution to directly obtain ground-truth radiance fields from 3D meshes, characterizing the density field with an occupancy function featuring a defined surface thickness, and determining view-dependent color through a reflection function considering both the mesh and environment lighting. Mesh2NeRF extracts accurate radiance fields which provides direct supervision for training generative NeRFs and single scene representation. We validate the effectiveness of Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in PSNR for view synthesis in single scene representation on the ABO dataset, a 0.69 PSNR enhancement in the single-view conditional generation of ShapeNet Cars, and notably improved mesh extraction from NeRF in the unconditional generation of Objaverse Mugs.
Submitted 5 September, 2024; v1 submitted 28 March, 2024; originally announced March 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024, Project page: https://terencecyj.github.io/projects/Mesh2NeRF/ Video: https://youtu.be/SsFkhSuQYGM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17636">arXiv:2403.17636</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17636">pdf</a>, <a href="https://arxiv.org/format/2403.17636">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Mix-Initiative Response Generation with Dynamic Prefix Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Heyan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xian-Ling Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+L">Lizi Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17636v2-abstract-short" style="display: inline;"> Mixed initiative serves as one of the key factors in controlling conversation directions. For a speaker, responding passively or leading proactively would result in rather different responses. However, most dialogue systems focus on training a holistic response generation model without any distinction among different initiatives. It leads to the cross-contamination problem, where the model confuse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17636v2-abstract-full').style.display = 'inline'; document.getElementById('2403.17636v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17636v2-abstract-full" style="display: none;"> Mixed initiative serves as one of the key factors in controlling conversation directions. For a speaker, responding passively or leading proactively would result in rather different responses. However, most dialogue systems focus on training a holistic response generation model without any distinction among different initiatives. It leads to the cross-contamination problem, where the model confuses different initiatives and generates inappropriate responses. Moreover, obtaining plenty of human annotations for initiative labels can be expensive. To address this issue, we propose a general mix-Initiative Dynamic Prefix Tuning framework (IDPT) to decouple different initiatives from the generation model, which learns initiative-aware prefixes in both supervised and unsupervised settings. Specifically, IDPT decouples initiative factors into different prefix parameters and uses the attention mechanism to adjust the selection of initiatives in guiding generation dynamically. The prefix parameters can be tuned towards accurate initiative prediction as well as mix-initiative response generation. Extensive experiments on two public dialogue datasets show that the proposed IDPT outperforms previous baselines on both automatic metrics and human evaluations. It also manages to generate appropriate responses with manipulated initiatives. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17636v2-abstract-full').style.display = 'none'; document.getElementById('2403.17636v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the main conference of NAACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16558">arXiv:2403.16558</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.16558">pdf</a>, <a href="https://arxiv.org/format/2403.16558">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Elysium: Exploring Object-level Perception in Videos via MLLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yongjie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Can Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16558v2-abstract-short" style="display: inline;"> Multi-modal Large Language Models (MLLMs) have demonstrated their ability to perceive objects in still images, but their application in video-related tasks, such as object tracking, remains understudied. This lack of exploration is primarily due to two key challenges. Firstly, extensive pretraining on large-scale video datasets is required to equip MLLMs with the capability to perceive objects acr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16558v2-abstract-full').style.display = 'inline'; document.getElementById('2403.16558v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16558v2-abstract-full" style="display: none;"> Multi-modal Large Language Models (MLLMs) have demonstrated their ability to perceive objects in still images, but their application in video-related tasks, such as object tracking, remains understudied. This lack of exploration is primarily due to two key challenges. Firstly, extensive pretraining on large-scale video datasets is required to equip MLLMs with the capability to perceive objects across multiple frames and understand inter-frame relationships. Secondly, processing a large number of frames within the context window of Large Language Models (LLMs) can impose a significant computational burden. 
To address the first challenge, we introduce ElysiumTrack-1M, a large-scale video dataset supported for three tasks: Single Object Tracking (SOT), Referring Single Object Tracking (RSOT), and Video Referring Expression Generation (Video-REG). ElysiumTrack-1M contains 1.27 million annotated video frames with corresponding object boxes and descriptions. Leveraging this dataset, we conduct training of MLLMs and propose a token-compression model T-Selector to tackle the second challenge. Our proposed approach, Elysium: Exploring Object-level Perception in Videos via MLLM, is an end-to-end trainable MLLM that attempts to conduct object-level tasks in videos without requiring any additional plug-in or expert models. All codes and datasets are available at https://github.com/Hon-Wong/Elysium. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16558v2-abstract-full').style.display = 'none'; document.getElementById('2403.16558v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14735">arXiv:2403.14735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.14735">pdf</a>, <a href="https://arxiv.org/format/2403.14735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3637528.3671451">10.1145/3637528.3671451 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Foundation Models for Time Series Analysis: A Tutorial and Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+H">Haomin Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuqi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yushan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+M">Ming Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+D">Dongjin Song</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Shirui Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14735v3-abstract-short" style="display: inline;"> Time series analysis stands as a focal point within the data mining community, serving as a cornerstone for extracting valuable insights crucial to a myriad of real-world applications. Recent advances in Foundation Models (FMs) have fundamentally reshaped the paradigm of model design for time series analysis, boosting various downstream tasks in practice. 
Abstract: Time series analysis stands as a focal point within the data mining community, serving as a cornerstone for extracting valuable insights crucial to a myriad of real-world applications. Recent advances in Foundation Models (FMs) have fundamentally reshaped the paradigm of model design for time series analysis, boosting various downstream tasks in practice. These innovative approaches often leverage pre-trained or fine-tuned FMs to harness generalized knowledge tailored for time series analysis. This survey aims to furnish a comprehensive and up-to-date overview of FMs for time series analysis. While prior surveys have predominantly focused on either application or pipeline aspects of FMs in time series analysis, they have often lacked an in-depth understanding of the underlying mechanisms that elucidate why and how FMs benefit time series analysis. To address this gap, our survey adopts a methodology-centric classification, delineating various pivotal elements of time-series FMs, including model architectures, pre-training techniques, adaptation methods, and data modalities. Overall, this survey serves to consolidate the latest advancements in FMs pertinent to time series analysis, accentuating their theoretical underpinnings, recent strides in development, and avenues for future exploration.
Submitted 18 June, 2024; v1 submitted 21 March, 2024; originally announced March 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD&#39;24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11401">arXiv:2403.11401</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11401">pdf</a>, <a href="https://arxiv.org/format/2403.11401">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scene-LLM: Extending Language Model for 3D Visual Understanding and Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+R">Rao Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xilun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yixin Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+W">Wenhan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11401v2-abstract-short" style="display: inline;"> This paper introduces Scene-LLM, a 3D-visual-language model that enhances embodied agents&#39; abilities in interactive 3D indoor environments by integrating the reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a hybrid 3D visual feature representation, that incorporates dense spatial information and supports scene state updates. The model employs a projection layer to efficiently&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11401v2-abstract-full').style.display = 'inline'; document.getElementById('2403.11401v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11401v2-abstract-full" style="display: none;"> This paper introduces Scene-LLM, a 3D-visual-language model that enhances embodied agents&#39; abilities in interactive 3D indoor environments by integrating the reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a hybrid 3D visual feature representation, that incorporates dense spatial information and supports scene state updates. The model employs a projection layer to efficiently project these features in the pre-trained textual embedding space, enabling effective interpretation of 3D visual information. Unique to our approach is the integration of both scene-level and ego-centric 3D information. This combination is pivotal for interactive planning, where scene-level data supports global planning and ego-centric data is important for localization. Notably, we use ego-centric 3D frame features for feature alignment, an efficient technique that enhances the model&#39;s ability to align features of small objects within the scene. Our experiments with Scene-LLM demonstrate its strong capabilities in dense captioning, question answering, and interactive planning. 
We believe Scene-LLM advances the field of 3D visual understanding and reasoning, offering new possibilities for sophisticated agent interactions in indoor settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11401v2-abstract-full').style.display = 'none'; document.getElementById('2403.11401v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.07376">arXiv:2403.07376</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.07376">pdf</a>, <a href="https://arxiv.org/format/2403.07376">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> NavCoT: Boosting LLM-Based Vision-and-Language Navigation via Learning Disentangled Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B">Bingqian Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yunshuang Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Ziming Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiaqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+S">Shikui Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.07376v1-abstract-short" style="display: inline;"> Vision-and-Language Navigation (VLN), as a crucial research problem of Embodied AI, requires an embodied agent to navigate through complex 3D environments following natural language instructions. Recent research has highlighted the promising capacity of large language models (LLMs) in VLN by improving navigational reasoning accuracy and interpretability. 
Abstract: Vision-and-Language Navigation (VLN), as a crucial research problem of Embodied AI, requires an embodied agent to navigate through complex 3D environments following natural language instructions. Recent research has highlighted the promising capacity of large language models (LLMs) in VLN by improving navigational reasoning accuracy and interpretability. However, their predominant use in an offline manner usually suffers from substantial domain gap between the VLN task and the LLM training corpus. This paper introduces a novel strategy called Navigational Chain-of-Thought (NavCoT), where we fulfill parameter-efficient in-domain training to enable self-guided navigational decision, leading to a significant mitigation of the domain gap in a cost-effective manner. Specifically, at each timestep, the LLM is prompted to forecast the navigational chain-of-thought by: 1) acting as a world model to imagine the next observation according to the instruction, 2) selecting the candidate observation that best aligns with the imagination, and 3) determining the action based on the reasoning from the prior steps. Through constructing formalized labels for training, the LLM can learn to generate desired and reasonable chain-of-thought outputs for improving the action decision. Experimental results across various training settings and popular VLN benchmarks (e.g., Room-to-Room (R2R), Room-across-Room (RxR), Room-for-Room (R4R)) show the significant superiority of NavCoT over the direct action prediction variants. Through simple parameter-efficient finetuning, our NavCoT outperforms a recent GPT4-based approach with ~7% relative improvement on the R2R dataset. We believe that NavCoT will help unlock more task-adaptive and scalable LLM-based embodied agents, which are helpful for developing real-world robotics applications. Code is available at https://github.com/expectorlin/NavCoT.
Submitted 12 March, 2024; originally announced March 2024.

arXiv:2402.02074 [pdf, other] cs.CV
Multi-RoI Human Mesh Recovery with Camera Consistency and Contrastive Losses
Authors: Yongwei Nie, Changzhen Liu, Chengjiang Long, Qing Zhang, Guiqing Li, Hongmin Cai
Abstract: Besides a 3D mesh, Human Mesh Recovery (HMR) methods usually need to estimate a camera for computing 2D reprojection loss. Previous approaches may encounter the following problem: both the mesh and camera are not correct but the combination of them can yield a low reprojection loss. To alleviate this problem, we define multiple RoIs (region of interest) containing the same human and propose a multiple-RoI-based HMR method. Our key idea is that with multiple RoIs as input, we can estimate multiple local cameras and have the opportunity to design and apply additional constraints between cameras to improve the accuracy of the cameras and, in turn, the accuracy of the corresponding 3D mesh. To implement this idea, we propose a RoI-aware feature fusion network by which we estimate a 3D mesh shared by all RoIs as well as local cameras corresponding to the RoIs. We observe that local cameras can be converted to the camera of the full image through which we construct a local camera consistency loss as the additional constraint imposed on local cameras. Another benefit of introducing multiple RoIs is that we can encapsulate our network into a contrastive learning framework and apply a contrastive loss to regularize the training of our network. Experiments demonstrate the effectiveness of our multi-RoI HMR method and superiority to recent prior arts. Our code is available at https://github.com/CptDiaos/Multi-RoI.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02074v2-abstract-full').style.display = 'none'; document.getElementById('2402.02074v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01253">arXiv:2402.01253</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.01253">pdf</a>, <a href="https://arxiv.org/format/2402.01253">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RimiRec: Modeling Refined Multi-interest in Hierarchical Structure for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pei%2C+H">Haolei Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yuanyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yangping Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yuan Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01253v3-abstract-short" style="display: inline;"> Industrial recommender systems usually consist of the retrieval stage and the ranking stage, to handle the billion-scale of users and items. The retrieval stage retrieves candidate items relevant to user interests for recommendations and has attracted much attention. Frequently, a user shows refined multi-interests in a hierarchical structure. For example, a user likes Conan and Kuroba Kaito, whic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01253v3-abstract-full').style.display = 'inline'; document.getElementById('2402.01253v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01253v3-abstract-full" style="display: none;"> Industrial recommender systems usually consist of the retrieval stage and the ranking stage, to handle the billion-scale of users and items. The retrieval stage retrieves candidate items relevant to user interests for recommendations and has attracted much attention. Frequently, a user shows refined multi-interests in a hierarchical structure. For example, a user likes Conan and Kuroba Kaito, which are the roles in hierarchical structure &#34;Animation, Japanese Animation, Detective Conan&#34;. However, most existing methods ignore this hierarchical nature, and simply average the fine-grained interest information. Therefore, we propose a novel two-stage approach to explicitly modeling refined multi-interest in a hierarchical structure for recommendation. In the first hierarchical multi-interest mining stage, the hierarchical clustering and transformer-based model adaptively generate circles or sub-circles that users are interested in. 
In the second stage, the partition of retrieval space allows the EBR models to deal only with items within each circle and accurately capture users&#39; refined interests. Experimental results show that the proposed approach achieves state-of-the-art performance. Our framework has also been deployed at Lofter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01253v3-abstract-full').style.display = 'none'; document.getElementById('2402.01253v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.14121">arXiv:2401.14121</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.14121">pdf</a>, <a href="https://arxiv.org/format/2401.14121">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Test-Time Optimization into Training with Dual Networks for Human Mesh Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yongwei Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+M">Mingxian Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+C">Chengjiang Long</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jian Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuemiao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.14121v2-abstract-short" style="display: inline;"> Human Mesh Recovery (HMR) is the task of estimating a parameterized 3D human mesh from an image. There is a kind of methods first training a regression model for this problem, then further optimizing the pretrained regression model for any specific sample individually at test time. However, the pretrained model may not provide an ideal optimization starting point for the test-time optimization. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14121v2-abstract-full').style.display = 'inline'; document.getElementById('2401.14121v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.14121v2-abstract-full" style="display: none;"> Human Mesh Recovery (HMR) is the task of estimating a parameterized 3D human mesh from an image. There is a kind of methods first training a regression model for this problem, then further optimizing the pretrained regression model for any specific sample individually at test time. 
However, the pretrained model may not provide an ideal optimization starting point for the test-time optimization. Inspired by meta-learning, we incorporate the test-time optimization into training, performing a step of test-time optimization for each sample in the training batch before really conducting the training optimization over all the training samples. In this way, we obtain a meta-model, the meta-parameter of which is friendly to the test-time optimization. At test time, after several test-time optimization steps starting from the meta-parameter, we obtain much higher HMR accuracy than the test-time optimization starting from the simply pretrained regression model. Furthermore, we find test-time HMR objectives are different from training-time objectives, which reduces the effectiveness of the learning of the meta-model. To solve this problem, we propose a dual-network architecture that unifies the training-time and test-time objectives. Our method, armed with meta-learning and the dual networks, outperforms state-of-the-art regression-based and optimization-based HMR approaches, as validated by the extensive experiments. The codes are available at https://github.com/fmx789/Meta-HMR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14121v2-abstract-full').style.display = 'none'; document.getElementById('2401.14121v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Nie%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 
