Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,403 results for author: <span class="mathjax">Mao, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Mao%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Mao, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Mao%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Mao, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Mao%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Mao%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Mao%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Mao%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Mao%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Mao%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17310">arXiv:2411.17310</a> <span> [<a href="https://arxiv.org/pdf/2411.17310">pdf</a>, <a href="https://arxiv.org/format/2411.17310">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reward Incremental Learning in Text-to-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+M">Maorong Wang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jiafeng Mao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xueting Wang</a>, <a href="/search/cs?searchtype=author&query=Yamasaki%2C+T">Toshihiko Yamasaki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17310v1-abstract-short" style="display: inline;"> The recent success of denoising diffusion models has significantly advanced text-to-image generation. While these large-scale pretrained models show excellent performance in general image synthesis, downstream objectives often require fine-tuning to meet specific criteria such as aesthetics or human preference. 
Reward gradient-based strategies are promising in this context, yet existing methods ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17310v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17310v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17310v1-abstract-full" style="display: none;"> The recent success of denoising diffusion models has significantly advanced text-to-image generation. While these large-scale pretrained models show excellent performance in general image synthesis, downstream objectives often require fine-tuning to meet specific criteria such as aesthetics or human preference. Reward gradient-based strategies are promising in this context, yet existing methods are limited to single-reward tasks, restricting their applicability in real-world scenarios that demand adapting to multiple objectives introduced incrementally over time. In this paper, we first define this more realistic and unexplored problem, termed Reward Incremental Learning (RIL), where models are desired to adapt to multiple downstream objectives incrementally. Additionally, while the models adapt to the ever-emerging new objectives, we observe a unique form of catastrophic forgetting in diffusion model fine-tuning, affecting both metric-wise and visual structure-wise image quality. To address this catastrophic forgetting challenge, we propose Reward Incremental Distillation (RID), a method that mitigates forgetting with minimal computational overhead, enabling stable performance across sequential reward tasks. The experimental results demonstrate the efficacy of RID in achieving consistent, high-quality generation in RIL scenarios. The source code of our work will be publicly available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17310v1-abstract-full').style.display = 'none'; document.getElementById('2411.17310v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
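The RID entry above names a distillation-based fix for reward forgetting but gives no objective. As a rough, hedged illustration of what a reward-plus-distillation fine-tuning loss can look like (the model API, `reward_fn`, and the weighting are assumptions, not the paper's method):

```python
import torch
import torch.nn.functional as F

def rid_style_loss(student, teacher, reward_fn, latents, prompt_emb, distill_weight=1.0):
    """Illustrative loss: maximize a differentiable reward while distilling
    from a frozen snapshot trained on earlier reward tasks.
    All names here are assumptions for illustration, not the paper's API."""
    pred = student(latents, prompt_emb)            # current model's output
    with torch.no_grad():
        pred_old = teacher(latents, prompt_emb)    # snapshot before the new reward task
    reward_loss = -reward_fn(pred).mean()          # gradient ascent on the new reward
    distill_loss = F.mse_loss(pred, pred_old)      # stay close to previous behaviour
    return reward_loss + distill_weight * distill_loss
```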
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16796">arXiv:2411.16796</a> <span> [<a href="https://arxiv.org/pdf/2411.16796">pdf</a>, <a href="https://arxiv.org/format/2411.16796">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Efficient Model-Heterogeneity Federated Learning for Large Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+R">Ruofan Jia</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+W">Weiying Xie</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jie Lei</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haonan Qin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jitao Ma</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Leyuan Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16796v1-abstract-short" style="display: inline;"> As demand grows for complex tasks and high-performance applications in edge computing, the deployment of large models in federated learning has become increasingly urgent, given their superior representational power and generalization capabilities. However, the resource constraints and heterogeneity among clients present significant challenges to this deployment. To tackle these challenges, we int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16796v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16796v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16796v1-abstract-full" style="display: none;"> As demand grows for complex tasks and high-performance applications in edge computing, the deployment of large models in federated learning has become increasingly urgent, given their superior representational power and generalization capabilities. However, the resource constraints and heterogeneity among clients present significant challenges to this deployment. To tackle these challenges, we introduce HeteroTune, an innovative fine-tuning framework tailored for model-heterogeneity federated learning (MHFL). In particular, we propose a novel parameter-efficient fine-tuning (PEFT) structure, called FedAdapter, which employs a multi-branch cross-model aggregator to enable efficient knowledge aggregation across diverse models. Benefiting from the lightweight FedAdapter, our approach significantly reduces both the computational and communication overhead. Finally, our approach is simple yet effective, making it applicable to a wide range of large model fine-tuning tasks. Extensive experiments on computer vision (CV) and natural language processing (NLP) tasks demonstrate that our method achieves state-of-the-art results, seamlessly integrating efficiency and performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16796v1-abstract-full').style.display = 'none'; document.getElementById('2411.16796v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8pages, 5figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16726">arXiv:2411.16726</a> <span> [<a href="https://arxiv.org/pdf/2411.16726">pdf</a>, <a href="https://arxiv.org/format/2411.16726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EmotiveTalk: Expressive Talking Head Generation through Audio Information Decoupling and Emotional Video Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haotian Wang</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+Y">Yuzhe Weng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yueyan Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zilu Guo</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jun Du</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+S">Shutong Niu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiefeng Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shan He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoyan Wu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qiming Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Bing Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cong Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingfeng Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16726v1-abstract-short" style="display: inline;"> Diffusion models have revolutionized the field of talking head generation, yet still face challenges in expressiveness, controllability, and stability in long-time generation. In this research, we propose an EmotiveTalk framework to address these issues. 
3. arXiv:2411.16726 [pdf, other] (cs.CV, cs.AI)
   EmotiveTalk: Expressive Talking Head Generation through Audio Information Decoupling and Emotional Video Diffusion
   Authors: Haotian Wang, Yuzhe Weng, Yueyan Li, Zilu Guo, Jun Du, Shutong Niu, Jiefeng Ma, Shan He, Xiaoyan Wu, Qiming Hu, Bing Yin, Cong Liu, Qingfeng Liu
   Abstract: Diffusion models have revolutionized the field of talking head generation, yet still face challenges in expressiveness, controllability, and stability in long-time generation. In this research, we propose an EmotiveTalk framework to address these issues. Firstly, to realize better control over the generation of lip movement and facial expression, a Vision-guided Audio Information Decoupling (V-AID) approach is designed to generate audio-based decoupled representations aligned with lip movements and expression. Specifically, to achieve alignment between audio and facial expression representation spaces, we present a Diffusion-based Co-speech Temporal Expansion (Di-CTE) module within V-AID to generate expression-related representations under multi-source emotion condition constraints. Then we propose a well-designed Emotional Talking Head Diffusion (ETHD) backbone to efficiently generate highly expressive talking head videos, which contains an Expression Decoupling Injection (EDI) module to automatically decouple the expressions from reference portraits while integrating the target expression information, achieving more expressive generation performance. Experimental results show that EmotiveTalk can generate expressive talking head videos, ensuring the promised controllability of emotions and stability during long-time generation, yielding state-of-the-art performance compared to existing methods.
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: 19 pages, 16 figures
4. arXiv:2411.15720 [pdf, other] (cs.CV)
   Chain of Attack: On the Robustness of Vision-Language Models Against Transfer-Based Adversarial Attacks
   Authors: Peng Xie, Yequan Bie, Jianda Mao, Yangqiu Song, Yang Wang, Hao Chen, Kani Chen
   Abstract: Pre-trained vision-language models (VLMs) have showcased remarkable performance in image and natural language understanding, such as image captioning and response generation. As the practical applications of vision-language models become increasingly widespread, their potential safety and robustness issues raise concerns that adversaries may evade the system and cause these models to generate toxic content through malicious attacks. Therefore, evaluating the robustness of open-source VLMs against adversarial attacks has garnered growing attention, with transfer-based attacks as a representative black-box attacking strategy. However, most existing transfer-based attacks neglect the importance of the semantic correlations between vision and text modalities, leading to sub-optimal adversarial example generation and attack performance. To address this issue, we present Chain of Attack (CoA), which iteratively enhances the generation of adversarial examples based on the multi-modal semantic update using a series of intermediate attacking steps, achieving superior adversarial transferability and efficiency. A unified attack success rate computing method is further proposed for automatic evasion evaluation. Extensive experiments conducted under the most realistic and high-stakes scenario demonstrate that our attacking strategy can effectively mislead models to generate targeted responses using only black-box attacks without any knowledge of the victim models. The comprehensive robustness evaluation in our paper provides insight into the vulnerabilities of VLMs and offers a reference for the safety considerations of future model developments.
   Submitted 24 November, 2024; originally announced November 2024.
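The "unified attack success rate computing method" is not defined in the abstract; a plain targeted attack success rate over model responses, shown here as a stand-in, simply counts how often the victim's output contains the attacker's target phrase (the matching rule is an assumption):

```python
def targeted_attack_success_rate(responses, targets):
    """Generic targeted ASR: fraction of adversarial queries whose model
    response contains the attacker's target phrase (case-insensitive).
    Illustrative only; the paper's unified metric is not specified here."""
    assert len(responses) == len(targets) and responses
    hits = sum(t.lower() in r.lower() for r, t in zip(responses, targets))
    return hits / len(responses)

# Example: two of three responses contain their target phrase, so ASR is about 0.67.
print(targeted_attack_success_rate(
    ["a photo of a dog", "a red car on the road", "a sunny beach"],
    ["dog", "car", "snowy mountain"],
))
```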
5. arXiv:2411.15447 [pdf, other] (cs.MM, cs.CV, cs.SD, eess.AS)
   Gotta Hear Them All: Sound Source Aware Vision to Audio Generation
   Authors: Wei Guo, Heng Wang, Jianbo Ma, Weidong Cai
   Abstract: Vision-to-audio (V2A) synthesis has broad applications in multimedia. Recent advancements of V2A methods have made it possible to generate relevant audios from inputs of videos or still images. However, the immersiveness and expressiveness of the generation are limited. One possible problem is that existing methods solely rely on the global scene and overlook details of local sounding objects (i.e., sound sources). To address this issue, we propose a Sound Source-Aware V2A (SSV2A) generator. SSV2A is able to locally perceive multimodal sound sources from a scene with visual detection and cross-modality translation. It then contrastively learns a Cross-Modal Sound Source (CMSS) Manifold to semantically disambiguate each source. Finally, we attentively mix their CMSS semantics into a rich audio representation, from which a pretrained audio generator outputs the sound. To model the CMSS manifold, we curate a novel single-sound-source visual-audio dataset VGGS3 from VGGSound. We also design a Sound Source Matching Score to measure localized audio relevance. This is to our knowledge the first work to address V2A generation at the sound-source level. Extensive experiments show that SSV2A surpasses state-of-the-art methods in both generation fidelity and relevance. We further demonstrate SSV2A's ability to achieve intuitive V2A control by compositing vision, text, and audio conditions. Our SSV2A generation can be tried and heard at https://ssv2a.github.io/SSV2A-demo
   Submitted 25 November, 2024; v1 submitted 22 November, 2024; originally announced November 2024.
   Comments: 16 pages, 9 figures, source code released at https://github.com/wguo86/SSV2A
6. arXiv:2411.15271 [pdf, other] (cs.CV, cs.AI)
   EADReg: Probabilistic Correspondence Generation with Efficient Autoregressive Diffusion Model for Outdoor Point Cloud Registration
   Authors: Linrui Gong, Jiuming Liu, Junyi Ma, Lihao Liu, Yaonan Wang, Hesheng Wang
   Abstract: Diffusion models have shown great potential in the point cloud registration (PCR) task, especially for enhancing robustness in challenging cases. However, existing diffusion-based PCR methods primarily focus on instance-level scenarios and struggle with outdoor LiDAR points, where the sparsity, irregularity, and huge point scale inherent in LiDAR points pose challenges to establishing dense global point-to-point correspondences. To address this issue, we propose a novel framework named EADReg for efficient and robust registration of LiDAR point clouds based on autoregressive diffusion models. EADReg follows a coarse-to-fine registration paradigm. In the coarse stage, we employ a Bi-directional Gaussian Mixture Model (BGMM) to reject outlier points and obtain purified point cloud pairs. BGMM establishes correspondences between the Gaussian Mixture Models (GMMs) from the source and target frames, enabling reliable coarse registration based on filtered features and geometric information. In the fine stage, we treat diffusion-based PCR as an autoregressive process to generate robust point correspondences, which are then iteratively refined on upper layers. Despite common criticisms of diffusion-based methods regarding inference speed, EADReg achieves runtime comparable to convolutional-based methods. Extensive experiments on the KITTI and NuScenes benchmark datasets highlight the state-of-the-art performance of our proposed method. Codes will be released upon publication.
   Submitted 22 November, 2024; originally announced November 2024.
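The BGMM correspondence step in EADReg is not reproducible from the abstract alone; for context, once correspondences are available (for example, matched GMM component means), a rigid transform is conventionally recovered in closed form with the Kabsch/SVD solution, sketched below:

```python
import numpy as np

def rigid_transform_from_correspondences(src: np.ndarray, tgt: np.ndarray):
    """Closed-form (Kabsch/SVD) rigid alignment of corresponding 3D points.
    src, tgt: (N, 3) arrays of matched points (e.g., matched GMM means).
    Returns rotation R (3x3) and translation t (3,) such that R @ src.T + t approximates tgt."""
    src_c, tgt_c = src.mean(0), tgt.mean(0)
    H = (src - src_c).T @ (tgt - tgt_c)          # cross-covariance of centered points
    U, _, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:                     # guard against reflections
        Vt[-1] *= -1
        R = Vt.T @ U.T
    t = tgt_c - R @ src_c
    return R, t
```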
7. arXiv:2411.15041 [pdf, other] (cs.AI, cs.CL)
   mR$^2$AG: Multimodal Retrieval-Reflection-Augmented Generation for Knowledge-Based VQA
   Authors: Tao Zhang, Ziqi Zhang, Zongyang Ma, Yuxin Chen, Zhongang Qi, Chunfeng Yuan, Bing Li, Junfu Pu, Yuxuan Zhao, Zehua Xie, Jin Ma, Ying Shan, Weiming Hu
   Abstract: Advanced Multimodal Large Language Models (MLLMs) struggle with recent Knowledge-based VQA tasks, such as INFOSEEK and Encyclopedic-VQA, due to their limited and frozen knowledge scope, often leading to ambiguous and inaccurate responses. Thus, multimodal Retrieval-Augmented Generation (mRAG) is naturally introduced to provide MLLMs with comprehensive and up-to-date knowledge, effectively expanding the knowledge scope. However, current mRAG methods have inherent drawbacks, including: 1) performing retrieval even when external knowledge is not needed; 2) lacking identification of the evidence that supports the query; 3) increasing model complexity due to additional information filtering modules or rules. To address these shortcomings, we propose a novel generalized framework called multimodal Retrieval-Reflection-Augmented Generation (mR$^2$AG), which achieves adaptive retrieval and useful information localization to enable answers through two easy-to-implement reflection operations, preventing high model complexity. In mR$^2$AG, Retrieval-Reflection is designed to distinguish different user queries and avoids redundant retrieval calls, and Relevance-Reflection is introduced to guide the MLLM in locating beneficial evidence of the retrieved content and generating answers accordingly. In addition, mR$^2$AG can be integrated into any well-trained MLLM with efficient fine-tuning on the proposed mR$^2$AG Instruction-Tuning dataset (mR$^2$AG-IT). mR$^2$AG significantly outperforms state-of-the-art MLLMs (e.g., GPT-4v/o) and RAG-based MLLMs on INFOSEEK and Encyclopedic-VQA, while maintaining the exceptional capabilities of base MLLMs across a wide range of visual-dependent tasks.
   Submitted 22 November, 2024; originally announced November 2024.
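The two reflection operations are described only at a high level; a generic control flow for adaptive retrieval of this kind might look like the following sketch, where every callable is a placeholder rather than an mR$^2$AG component:

```python
def answer_with_adaptive_retrieval(question, image, model, retriever,
                                   needs_retrieval, passage_is_relevant):
    """Generic adaptive-RAG flow: first decide whether retrieval is needed at
    all, then keep only passages judged relevant before answering.
    model, retriever, needs_retrieval, and passage_is_relevant are placeholder
    callables, not the paper's actual modules."""
    if not needs_retrieval(question, image):      # retrieval-gating step
        return model(question, image, context=[])
    passages = retriever(question, image, top_k=5)
    evidence = [p for p in passages if passage_is_relevant(question, image, p)]  # relevance step
    return model(question, image, context=evidence)
```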
8. arXiv:2411.14927 [pdf, other] (cs.CV, cs.AI, cs.LG, cs.RO)
   LiDAR-based End-to-end Temporal Perception for Vehicle-Infrastructure Cooperation
   Authors: Zhenwei Yang, Jilei Mao, Wenxian Yang, Yibo Ai, Yu Kong, Haibao Yu, Weidong Zhang
   Abstract: Temporal perception, the ability to detect and track objects over time, is critical in autonomous driving for maintaining a comprehensive understanding of dynamic environments. However, this task is hindered by significant challenges, including incomplete perception caused by occluded objects and observational blind spots, which are common in single-vehicle perception systems. To address these issues, we introduce LET-VIC, a LiDAR-based End-to-End Tracking framework for Vehicle-Infrastructure Cooperation (VIC). LET-VIC leverages Vehicle-to-Everything (V2X) communication to enhance temporal perception by fusing spatial and temporal data from both vehicle and infrastructure sensors. First, it spatially integrates Bird's Eye View (BEV) features from vehicle-side and infrastructure-side LiDAR data, creating a comprehensive view that mitigates occlusions and compensates for blind spots. Second, LET-VIC incorporates temporal context across frames, allowing the model to leverage historical data for enhanced tracking stability and accuracy. To further improve robustness, LET-VIC includes a Calibration Error Compensation (CEC) module to address sensor misalignments and ensure precise feature alignment. Experiments on the V2X-Seq-SPD dataset demonstrate that LET-VIC significantly outperforms baseline models, achieving at least a 13.7% improvement in mAP and a 13.1% improvement in AMOTA without considering communication delays. This work offers a practical solution and a new research direction for advancing temporal perception in autonomous driving through vehicle-infrastructure cooperation.
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: 11 pages, 7 figures
9. arXiv:2411.14743 [pdf, other] (cs.CV, cs.AI, q-bio.QM)
   FOCUS: Knowledge-enhanced Adaptive Visual Compression for Few-shot Whole Slide Image Classification
   Authors: Zhengrui Guo, Conghao Xiong, Jiabo Ma, Qichen Sun, Lishuang Feng, Jinzhuo Wang, Hao Chen
   Abstract: Few-shot learning presents a critical solution for cancer diagnosis in computational pathology (CPath), addressing fundamental limitations in data availability, particularly the scarcity of expert annotations and patient privacy constraints. A key challenge in this paradigm stems from the inherent disparity between the limited training set of whole slide images (WSIs) and the enormous number of contained patches, where a significant portion of these patches lacks diagnostically relevant information, potentially diluting the model's ability to learn and focus on critical diagnostic features. While recent works attempt to address this by incorporating additional knowledge, several crucial gaps hinder further progress: (1) despite the emergence of powerful pathology foundation models (FMs), their potential remains largely untapped, with most approaches limiting their use to basic feature extraction; (2) current language guidance mechanisms attempt to align text prompts with vast numbers of WSI patches all at once, struggling to leverage rich pathological semantic information. To this end, we introduce the knowledge-enhanced adaptive visual compression framework, dubbed FOCUS, which uniquely combines pathology FMs with language prior knowledge to enable a focused analysis of diagnostically relevant regions by prioritizing discriminative WSI patches. Our approach implements a progressive three-stage compression strategy: we first leverage FMs for global visual redundancy elimination, then integrate compressed features with language prompts for semantic relevance assessment, and finally perform neighbor-aware visual token filtering while preserving spatial coherence. Extensive experiments on pathological datasets spanning breast, lung, and ovarian cancers demonstrate its superior performance in few-shot pathology diagnosis. Code will be made available at https://github.com/dddavid4real/FOCUS.
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: 15 pages, 3 figures
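FOCUS's second stage ranks patches against language prompts; a bare-bones version of prompt-guided patch selection on pre-extracted embeddings (cosine similarity plus top-k, with k an assumed hyperparameter) could be sketched as:

```python
import numpy as np

def select_relevant_patches(patch_emb: np.ndarray, prompt_emb: np.ndarray, k: int = 256):
    """Rank patch embeddings (N, D) by cosine similarity to a text prompt
    embedding (D,) and keep the top-k indices. Illustrative stand-in for a
    prompt-guided relevance stage; FOCUS additionally performs redundancy
    elimination and neighbor-aware filtering, which are omitted here."""
    p = patch_emb / (np.linalg.norm(patch_emb, axis=1, keepdims=True) + 1e-8)
    q = prompt_emb / (np.linalg.norm(prompt_emb) + 1e-8)
    scores = p @ q
    return np.argsort(-scores)[:k], scores
```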
10. arXiv:2411.14169 [pdf, other] (cs.CV)
    Spatiotemporal Decoupling for Efficient Vision-Based Occupancy Forecasting
    Authors: Jingyi Xu, Xieyuanli Chen, Junyi Ma, Jiawei Huang, Jintao Xu, Yue Wang, Ling Pei
    Abstract: The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to neglecting the bias and uneven distribution of changing occupancy states in both space and time. In this paper, we propose a novel spatiotemporal decoupling vision-based paradigm to explicitly tackle the bias and achieve both effective and efficient 3D OCF. To tackle spatial bias in empty areas, we introduce a novel spatial representation that decouples the conventional dense 3D format into 2D bird's-eye view (BEV) occupancy with corresponding height values, enabling 3D OCF derived only from 2D predictions and thus enhancing efficiency. To reduce temporal bias on static voxels, we design temporal decoupling to improve end-to-end OCF by temporally associating instances via predicted flows. We develop an efficient multi-head network EfficientOCF to achieve 3D OCF with our devised spatiotemporally decoupled representation. A new metric, conditional IoU (C-IoU), is also introduced to provide a robust 3D OCF performance assessment, especially in datasets with missing or incomplete annotations. The experimental results demonstrate that EfficientOCF surpasses existing baseline methods on accuracy and efficiency, achieving state-of-the-art performance with a fast inference time of 82.33 ms on a single GPU. Our code will be released as open source.
    Submitted 21 November, 2024; originally announced November 2024.
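The representational idea in EfficientOCF, recovering a 3D occupancy volume from 2D BEV occupancy plus per-cell heights, can be illustrated with a small sketch; the fill rule (occupying voxels from the ground up to the predicted height) and the plain IoU metric are assumptions, since C-IoU is not specified in the abstract:

```python
import numpy as np

def bev_to_3d_occupancy(bev_occ: np.ndarray, bev_height: np.ndarray,
                        n_z: int, z_min: float, z_max: float) -> np.ndarray:
    """Lift a 2D BEV occupancy map (H, W) with per-cell height values (H, W)
    into a binary 3D voxel grid (H, W, n_z). Assumes each occupied BEV cell
    fills voxels from the ground up to its predicted height; this is an
    illustrative decoding, not EfficientOCF's exact formulation."""
    z_centers = np.linspace(z_min, z_max, n_z)
    occ3d = (bev_occ[..., None] > 0.5) & (z_centers[None, None, :] <= bev_height[..., None])
    return occ3d.astype(np.uint8)

def occupancy_iou(pred: np.ndarray, gt: np.ndarray) -> float:
    """Plain voxel IoU; the paper's conditional IoU additionally handles
    missing or incomplete annotations, which is not modeled here."""
    inter = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return float(inter) / max(float(union), 1.0)
```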
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13852v1-abstract-full').style.display = 'none'; document.getElementById('2411.13852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS'24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13281">arXiv:2411.13281</a> <span> [<a href="https://arxiv.org/pdf/2411.13281">pdf</a>, <a href="https://arxiv.org/format/2411.13281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> VideoAutoArena: An Automated Arena for Evaluating Large Multimodal Models in Video Analysis through User Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Ziyang Luo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haoning Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongxu Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Kankanhalli%2C+M">Mohan Kankanhalli</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junnan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13281v1-abstract-short" style="display: inline;"> Large multimodal models (LMMs) with advanced video analysis capabilities have recently garnered significant attention. However, most evaluations rely on traditional methods like multiple-choice questions in benchmarks such as VideoMME and LongVideoBench, which are prone to lack the depth needed to capture the complex demands of real-world users. To address this limitation-and due to the prohibitiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13281v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13281v1-abstract-full" style="display: none;"> Large multimodal models (LMMs) with advanced video analysis capabilities have recently garnered significant attention. However, most evaluations rely on traditional methods like multiple-choice questions in benchmarks such as VideoMME and LongVideoBench, which are prone to lack the depth needed to capture the complex demands of real-world users. 
To address this limitation-and due to the prohibitive cost and slow pace of human annotation for video tasks-we introduce VideoAutoArena, an arena-style benchmark inspired by LMSYS Chatbot Arena's framework, designed to automatically assess LMMs' video analysis abilities. VideoAutoArena utilizes user simulation to generate open-ended, adaptive questions that rigorously assess model performance in video understanding. The benchmark features an automated, scalable evaluation framework, incorporating a modified ELO Rating System for fair and continuous comparisons across multiple LMMs. To validate our automated judging system, we construct a 'gold standard' using a carefully curated subset of human annotations, demonstrating that our arena strongly aligns with human judgment while maintaining scalability. Additionally, we introduce a fault-driven evolution strategy, progressively increasing question complexity to push models toward handling more challenging video analysis scenarios. Experimental results demonstrate that VideoAutoArena effectively differentiates among state-of-the-art LMMs, providing insights into model strengths and areas for improvement. To further streamline our evaluation, we introduce VideoAutoBench as an auxiliary benchmark, where human annotators label winners in a subset of VideoAutoArena battles. We use GPT-4o as a judge to compare responses against these human-validated answers. Together, VideoAutoArena and VideoAutoBench offer a cost-effective, and scalable framework for evaluating LMMs in user-centric video analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13281v1-abstract-full').style.display = 'none'; document.getElementById('2411.13281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://videoautoarena.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12201">arXiv:2411.12201</a> <span> [<a href="https://arxiv.org/pdf/2411.12201">pdf</a>, <a href="https://arxiv.org/format/2411.12201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Invariant Shape Representation Learning For Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hossain%2C+T">Tonmoy Hossain</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miaomiao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12201v1-abstract-short" style="display: inline;"> Geometric shape features have been widely used as strong predictors for image classification. 
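The entry mentions a modified ELO Rating System but does not specify the modification; as a reference point, the standard Elo update after one pairwise battle is shown below (the K-factor and scale are conventional defaults, not the paper's settings).

```python
def elo_update(r_a: float, r_b: float, score_a: float, k: float = 32.0):
    """Standard Elo update for one battle.
    score_a: 1.0 if model A wins, 0.0 if it loses, 0.5 for a tie."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
    r_a_new = r_a + k * (score_a - expected_a)
    r_b_new = r_b + k * ((1.0 - score_a) - (1.0 - expected_a))
    return r_a_new, r_b_new

# Example: a 1200-rated model beats a 1300-rated one.
print(elo_update(1200, 1300, 1.0))
```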
arXiv:2411.12201 [pdf, other] cs.CV
Invariant Shape Representation Learning For Image Classification
Authors: Tonmoy Hossain, Jing Ma, Jundong Li, Miaomiao Zhang
Abstract: Geometric shape features have been widely used as strong predictors for image classification. Nevertheless, most existing classifiers such as deep neural networks (DNNs) directly leverage the statistical correlations between these shape features and target variables. However, these correlations can often be spurious and unstable across different environments (e.g., in different age groups, certain types of brain changes have unstable relations with neurodegenerative disease); hence leading to biased or inaccurate predictions. In this paper, we introduce a novel framework that for the first time develops invariant shape representation learning (ISRL) to further strengthen the robustness of image classifiers. In contrast to existing approaches that mainly derive features in the image space, our model ISRL is designed to jointly capture invariant features in latent shape spaces parameterized by deformable transformations. To achieve this goal, we develop a new learning paradigm based on invariant risk minimization (IRM) to learn invariant representations of image and shape features across multiple training distributions/environments. By embedding the features that are invariant with regard to target variables in different environments, our model consistently offers more accurate predictions. We validate our method by performing classification tasks on both simulated 2D images, real 3D brain and cine cardiovascular magnetic resonance images (MRIs). Our code is publicly available at https://github.com/tonmoy-hossain/ISRL.
Submitted 18 November, 2024; originally announced November 2024.
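The IRM objective referenced above is commonly instantiated as the IRMv1 gradient penalty of Arjovsky et al.; the minimal PyTorch sketch below shows that generic penalty (not the ISRL-specific formulation), where each environment contributes a risk term plus an invariance penalty.

```python
import torch
import torch.nn.functional as F

def irm_penalty(logits: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """IRMv1 penalty: squared gradient of the risk w.r.t. a fixed scalar
    'dummy' classifier multiplier w = 1.0."""
    w = torch.tensor(1.0, requires_grad=True)
    loss = F.cross_entropy(logits * w, y)
    grad = torch.autograd.grad(loss, [w], create_graph=True)[0]
    return grad.pow(2)

def irm_objective(per_env_logits, per_env_labels, lam: float = 10.0):
    """Mean empirical risk plus lambda-weighted invariance penalties,
    one term per training environment."""
    risks = [F.cross_entropy(lg, y) for lg, y in zip(per_env_logits, per_env_labels)]
    penalties = [irm_penalty(lg, y) for lg, y in zip(per_env_logits, per_env_labels)]
    return torch.stack(risks).mean() + lam * torch.stack(penalties).mean()
```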
arXiv:2411.12156 [pdf, other] cs.CL cs.AI
HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning with Hard Negatives
Authors: Wenxiao Liu, Zihong Yang, Chaozhuo Li, Zijin Hong, Jianfeng Ma, Zhiquan Liu, Litian Zhang, Feiran Huang
Abstract: Unsupervised sentence representation learning remains a critical challenge in modern natural language processing (NLP) research. Recently, contrastive learning techniques have achieved significant success in addressing this issue by effectively capturing textual semantics. Many such approaches prioritize the optimization using negative samples. In fields such as computer vision, hard negative samples (samples that are close to the decision boundary and thus more difficult to distinguish) have been shown to enhance representation learning. However, adapting hard negatives to contrastive sentence learning is complex due to the intricate syntactic and semantic details of text. To address this problem, we propose HNCSE, a novel contrastive learning framework that extends the leading SimCSE approach. The hallmark of HNCSE is its innovative use of hard negative samples to enhance the learning of both positive and negative samples, thereby achieving a deeper semantic understanding. Empirical tests on semantic textual similarity and transfer task datasets validate the superiority of HNCSE.
Submitted 18 November, 2024; originally announced November 2024.
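For context, the SimCSE-style InfoNCE loss that such frameworks build on contrasts each sentence with its own second encoding against in-batch negatives, and an explicit hard negative can be appended as an extra logit column. The sketch below is that generic construction, not HNCSE's exact loss.

```python
import torch
import torch.nn.functional as F

def simcse_loss_with_hard_negatives(z1, z2, z_hard, temperature: float = 0.05):
    """InfoNCE over in-batch negatives plus one explicit hard negative per anchor.

    z1, z2:  (B, D) two encodings of the same sentences (positive pairs)
    z_hard:  (B, D) one hard negative per anchor
    """
    z1, z2, z_hard = (F.normalize(t, dim=-1) for t in (z1, z2, z_hard))
    sim_pos = z1 @ z2.t()                                # (B, B) in-batch similarities
    sim_hard = (z1 * z_hard).sum(dim=-1, keepdim=True)   # (B, 1) hard-negative column
    logits = torch.cat([sim_pos, sim_hard], dim=1) / temperature
    labels = torch.arange(z1.size(0), device=z1.device)  # diagonal entries are positives
    return F.cross_entropy(logits, labels)
```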
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12156v1-abstract-full').style.display = 'none'; document.getElementById('2411.12156v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10708">arXiv:2411.10708</a> <span> [<a href="https://arxiv.org/pdf/2411.10708">pdf</a>, <a href="https://arxiv.org/format/2411.10708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AllRestorer: All-in-One Transformer for Image Restoration under Composite Degradations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jiawei Mao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xuesong Yin</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+L">Ling Shao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hao Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10708v1-abstract-short" style="display: inline;"> Image restoration models often face the simultaneous interaction of multiple degradations in real-world scenarios. Existing approaches typically handle single or composite degradations based on scene descriptors derived from text or image embeddings. However, due to the varying proportions of different degradations within an image, these scene descriptors may not accurately differentiate between d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10708v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10708v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10708v1-abstract-full" style="display: none;"> Image restoration models often face the simultaneous interaction of multiple degradations in real-world scenarios. Existing approaches typically handle single or composite degradations based on scene descriptors derived from text or image embeddings. However, due to the varying proportions of different degradations within an image, these scene descriptors may not accurately differentiate between degradations, leading to suboptimal restoration in practical applications. To address this issue, we propose a novel Transformer-based restoration framework, AllRestorer. In AllRestorer, we enable the model to adaptively consider all image impairments, thereby avoiding errors from scene descriptor misdirection. Specifically, we introduce an All-in-One Transformer Block (AiOTB), which adaptively removes all degradations present in a given image by modeling the relationships between all degradations and the image embedding in latent space. 
To accurately address different variations potentially present within the same type of degradation and minimize ambiguity, AiOTB utilizes a composite scene descriptor consisting of both image and text embeddings to define the degradation. Furthermore, AiOTB includes an adaptive weight for each degradation, allowing for precise control of the restoration intensity. By leveraging AiOTB, AllRestorer avoids misdirection caused by inaccurate scene descriptors, achieving a 5.00 dB increase in PSNR compared to the baseline on the CDD-11 dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10708v1-abstract-full').style.display = 'none'; document.getElementById('2411.10708v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10492">arXiv:2411.10492</a> <span> [<a href="https://arxiv.org/pdf/2411.10492">pdf</a>, <a href="https://arxiv.org/format/2411.10492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MFP3D: Monocular Food Portion Estimation Leveraging 3D Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jinge Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Vinod%2C+G">Gautham Vinod</a>, <a href="/search/cs?searchtype=author&query=Raghavan%2C+S">Siddeshwar Raghavan</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiangpeng He</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fengqing Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10492v1-abstract-short" style="display: inline;"> Food portion estimation is crucial for monitoring health and tracking dietary intake. Image-based dietary assessment, which involves analyzing eating occasion images using computer vision techniques, is increasingly replacing traditional methods such as 24-hour recalls. However, accurately estimating the nutritional content from images remains challenging due to the loss of 3D information when pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10492v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10492v1-abstract-full" style="display: none;"> Food portion estimation is crucial for monitoring health and tracking dietary intake. 
Image-based dietary assessment, which involves analyzing eating occasion images using computer vision techniques, is increasingly replacing traditional methods such as 24-hour recalls. However, accurately estimating the nutritional content from images remains challenging due to the loss of 3D information when projecting to the 2D image plane. Existing portion estimation methods are challenging to deploy in real-world scenarios due to their reliance on specific requirements, such as physical reference objects, high-quality depth information, or multi-view images and videos. In this paper, we introduce MFP3D, a new framework for accurate food portion estimation using only a single monocular image. Specifically, MFP3D consists of three key modules: (1) a 3D Reconstruction Module that generates a 3D point cloud representation of the food from the 2D image, (2) a Feature Extraction Module that extracts and concatenates features from both the 3D point cloud and the 2D RGB image, and (3) a Portion Regression Module that employs a deep regression model to estimate the food's volume and energy content based on the extracted features. Our MFP3D is evaluated on MetaFood3D dataset, demonstrating its significant improvement in accurate portion estimation over existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10492v1-abstract-full').style.display = 'none'; document.getElementById('2411.10492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9th International Workshop on Multimedia Assisted Dietary Management, in conjunction with the 27th International Conference on Pattern Recognition (ICPR2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09627">arXiv:2411.09627</a> <span> [<a href="https://arxiv.org/pdf/2411.09627">pdf</a>, <a href="https://arxiv.org/format/2411.09627">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One-Shot Manipulation Strategy Learning by Making Contact Analogies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuyao Liu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jiayuan Mao</a>, <a href="/search/cs?searchtype=author&query=Tenenbaum%2C+J">Joshua Tenenbaum</a>, <a href="/search/cs?searchtype=author&query=Lozano-P%C3%A9rez%2C+T">Tom谩s Lozano-P茅rez</a>, <a href="/search/cs?searchtype=author&query=Kaelbling%2C+L+P">Leslie Pack Kaelbling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09627v1-abstract-short" style="display: inline;"> We present a novel 
approach, MAGIC (manipulation analogies for generalizable intelligent contacts), for one-shot learning of manipulation strategies with fast and extensive generalization to novel objects. By leveraging a reference action trajectory, MAGIC effectively identifies similar contact points and sequences of actions on novel objects to replicate a demonstrated strategy, such as using dif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09627v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09627v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09627v1-abstract-full" style="display: none;"> We present a novel approach, MAGIC (manipulation analogies for generalizable intelligent contacts), for one-shot learning of manipulation strategies with fast and extensive generalization to novel objects. By leveraging a reference action trajectory, MAGIC effectively identifies similar contact points and sequences of actions on novel objects to replicate a demonstrated strategy, such as using different hooks to retrieve distant objects of different shapes and sizes. Our method is based on a two-stage contact-point matching process that combines global shape matching using pretrained neural features with local curvature analysis to ensure precise and physically plausible contact points. We experiment with three tasks including scooping, hanging, and hooking objects. MAGIC demonstrates superior performance over existing methods, achieving significant improvements in runtime speed and generalization to different object categories. Website: https://magic-2024.github.io/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09627v1-abstract-full').style.display = 'none'; document.getElementById('2411.09627v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
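The two-stage matching is only summarized in the abstract above; as a rough generic illustration (not the MAGIC implementation), transferring a demonstrated contact point to a novel object could combine feature similarity with a curvature-compatibility filter, as sketched below with assumed array shapes.

```python
import numpy as np

def match_contact_point(ref_feat, ref_curv, tgt_feats, tgt_curvs, curv_tol=0.2):
    """Pick the target point whose (pretrained) feature is most similar to the
    reference contact point, restricted to points with compatible curvature.

    ref_feat:  (D,)   feature at the demonstrated contact point
    ref_curv:  float  local curvature at that point
    tgt_feats: (N, D) features on the novel object's point cloud
    tgt_curvs: (N,)   local curvatures on the novel object
    """
    sim = tgt_feats @ ref_feat / (
        np.linalg.norm(tgt_feats, axis=1) * np.linalg.norm(ref_feat) + 1e-8)
    compatible = np.abs(tgt_curvs - ref_curv) < curv_tol
    sim = np.where(compatible, sim, -np.inf)   # discard physically implausible contacts
    return int(np.argmax(sim))
```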
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoRL LEAP Workshop, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07965">arXiv:2411.07965</a> <span> [<a href="https://arxiv.org/pdf/2411.07965">pdf</a>, <a href="https://arxiv.org/format/2411.07965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From General to Specific: Utilizing General Hallucination to Benchmark Specific Role-Playing Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chuyi Kong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Ziyang Luo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhiyuan Fan</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yaxin Fan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuxi Sun</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07965v2-abstract-short" style="display: inline;"> The advanced role-playing capabilities of Large Language Models (LLMs) have paved the way for developing Role-Playing Agents (RPAs). However, existing benchmarks in this domain, such as HPD and SocialBench face limitations like poor generalizability, implicit and inaccurate judgments, and the risk of model forgetting. To address the above issues, we propose an automatic, scalable, and generalizabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07965v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07965v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07965v2-abstract-full" style="display: none;"> The advanced role-playing capabilities of Large Language Models (LLMs) have paved the way for developing Role-Playing Agents (RPAs). However, existing benchmarks in this domain, such as HPD and SocialBench face limitations like poor generalizability, implicit and inaccurate judgments, and the risk of model forgetting. To address the above issues, we propose an automatic, scalable, and generalizable paradigm. Specifically, we construct a benchmark, SHARP, by extracting relations from a general knowledge graph and leveraging the inherent hallucination properties of RPAs to simulate interactions across roles. We employ ChatGPT for stance detection and define relationship hallucination along with three related metrics based on stance transfer. Extensive experiments validate the effectiveness and stability of our paradigm. Our findings further explore the factors influencing these metrics and discuss the trade-off between blind loyalty to relationships and adherence to facts in RPAs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07965v2-abstract-full').style.display = 'none'; document.getElementById('2411.07965v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Revise three typos in the abstract and methodology sections of the introduction</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07611">arXiv:2411.07611</a> <span> [<a href="https://arxiv.org/pdf/2411.07611">pdf</a>, <a href="https://arxiv.org/format/2411.07611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Clinical Reasoning through Knowledge-augmented Rationale Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+S">Shuai Niu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Liang Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhihua Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yida Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yunya Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07611v1-abstract-short" style="display: inline;"> Clinical rationales play a pivotal role in accurate disease diagnosis; however, many models predominantly use discriminative methods and overlook the importance of generating supportive rationales. Rationale distillation is a process that transfers knowledge from large language models (LLMs) to smaller language models (SLMs), thereby enhancing the latter's ability to break down complex tasks. Desp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07611v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07611v1-abstract-full" style="display: none;"> Clinical rationales play a pivotal role in accurate disease diagnosis; however, many models predominantly use discriminative methods and overlook the importance of generating supportive rationales. Rationale distillation is a process that transfers knowledge from large language models (LLMs) to smaller language models (SLMs), thereby enhancing the latter's ability to break down complex tasks. 
arXiv:2411.07611 [pdf, other] cs.CL cs.AI
Multimodal Clinical Reasoning through Knowledge-augmented Rationale Generation
Authors: Shuai Niu, Jing Ma, Liang Bai, Zhihua Wang, Yida Xu, Yunya Song, Xian Yang
Abstract: Clinical rationales play a pivotal role in accurate disease diagnosis; however, many models predominantly use discriminative methods and overlook the importance of generating supportive rationales. Rationale distillation is a process that transfers knowledge from large language models (LLMs) to smaller language models (SLMs), thereby enhancing the latter's ability to break down complex tasks. Despite its benefits, rationale distillation alone is inadequate for addressing domain knowledge limitations in tasks requiring specialized expertise, such as disease diagnosis. Effectively embedding domain knowledge in SLMs poses a significant challenge. While current LLMs are primarily geared toward processing textual data, multimodal LLMs that incorporate time series data, especially electronic health records (EHRs), are still evolving. To tackle these limitations, we introduce ClinRaGen, an SLM optimized for multimodal rationale generation in disease diagnosis. ClinRaGen incorporates a unique knowledge-augmented attention mechanism to merge domain knowledge with time series EHR data, utilizing a stepwise rationale distillation strategy to produce both textual and time series-based clinical rationales. Our evaluations show that ClinRaGen markedly improves the SLM's capability to interpret multimodal EHR data and generate accurate clinical rationales, supporting more reliable disease diagnosis, advancing LLM applications in healthcare, and narrowing the performance divide between LLMs and SLMs.
Submitted 12 November, 2024; originally announced November 2024.
Comments: 11 pages. 4 figures
ACM Class: I.2.7

arXiv:2411.07573 [pdf, other] cs.RO eess.SY
Robotic Control Optimization Through Kernel Selection in Safe Bayesian Optimization
Authors: Lihao Zheng, Hongxuan Wang, Xiaocong Li, Jun Ma, Prahlad Vadakkepat
Abstract: Control system optimization has long been a fundamental challenge in robotics. While recent advancements have led to the development of control algorithms that leverage learning-based approaches, such as SafeOpt, to optimize single feedback controllers, scaling these methods to high-dimensional complex systems with multiple controllers remains an open problem. In this paper, we propose a novel learning-based control optimization method, which enhances the additive Gaussian process-based Safe Bayesian Optimization algorithm to efficiently tackle high-dimensional problems through kernel selection. We use PID controller optimization in drones as a representative example and test the method on Safe Control Gym, a benchmark designed for evaluating safe control techniques. We show that the proposed method provides a more efficient and optimal solution for high-dimensional control optimization problems, demonstrating significant improvements over existing techniques.
Submitted 12 November, 2024; originally announced November 2024.
Comments: Accepted by 2024 IEEE International Conference on Robotics and Biomimetics (ROBIO)
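An additive Gaussian process models the objective as a sum of low-dimensional components, which is the standard trick that makes high-dimensional (safe) Bayesian optimization tractable; the sketch below builds such an additive kernel from one-dimensional RBF terms. The lengthscales and the 6-dimensional PID example are illustrative assumptions, not the paper's configuration.

```python
import numpy as np

def additive_rbf_kernel(X1, X2, lengthscales):
    """Additive squared-exponential kernel: a sum of 1-D RBF kernels, one per
    input dimension, so the GP decomposes over coordinates.

    X1: (N, D), X2: (M, D), lengthscales: (D,)
    Returns an (N, M) kernel matrix.
    """
    K = np.zeros((X1.shape[0], X2.shape[0]))
    for d, ell in enumerate(lengthscales):
        diff = X1[:, d, None] - X2[None, :, d]
        K += np.exp(-0.5 * (diff / ell) ** 2)
    return K

# Example: a hypothetical 6-dimensional search space (e.g. PID gains for two loops).
X = np.random.rand(5, 6)
K = additive_rbf_kernel(X, X, lengthscales=np.ones(6))
print(K.shape)  # (5, 5)
```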
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2024 IEEE International Conference on Robotics and Biomimetics (ROBIO)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07551">arXiv:2411.07551</a> <span> [<a href="https://arxiv.org/pdf/2411.07551">pdf</a>, <a href="https://arxiv.org/format/2411.07551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SP-VIO: Robust and Efficient Filter-Based Visual Inertial Odometry with State Transformation Model and Pose-Only Visual Description </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xueyu Du</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+C">Chengjun Ji</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lilian Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xinchan Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huaiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Maosong Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenqi Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jun Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07551v1-abstract-short" style="display: inline;"> Due to the advantages of high computational efficiency and small memory requirements, filter-based visual inertial odometry (VIO) has a good application prospect in miniaturized and payload-constrained embedded systems. However, the filter-based method has the problem of insufficient accuracy. To this end, we propose the State transformation and Pose-only VIO (SP-VIO) by rebuilding the state and m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07551v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07551v1-abstract-full" style="display: none;"> Due to the advantages of high computational efficiency and small memory requirements, filter-based visual inertial odometry (VIO) has a good application prospect in miniaturized and payload-constrained embedded systems. However, the filter-based method has the problem of insufficient accuracy. To this end, we propose the State transformation and Pose-only VIO (SP-VIO) by rebuilding the state and measurement models, and considering further visual deprived conditions. In detail, we first proposed a system model based on the double state transformation extended Kalman filter (DST-EKF), which has been proven to have better observability and consistency than the models based on extended Kalman filter (EKF) and state transformation extended Kalman filter (ST-EKF). Secondly, to reduce the influence of linearization error caused by inaccurate 3D reconstruction, we adopt the Pose-only (PO) theory to decouple the measurement model from 3D features. 
Moreover, to deal with visual deprived conditions, we propose a double state transformation Rauch-Tung-Striebel (DST-RTS) backtracking method to optimize motion trajectories during visual interruption. Experiments on public (EuRoC, Tum-VI, KITTI) and personal datasets show that SP-VIO has better accuracy and efficiency than state-of-the-art (SOTA) VIO algorithms, and has better robustness under visual deprived conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07551v1-abstract-full').style.display = 'none'; document.getElementById('2411.07551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07022">arXiv:2411.07022</a> <span> [<a href="https://arxiv.org/pdf/2411.07022">pdf</a>, <a href="https://arxiv.org/format/2411.07022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HeteroSample: Meta-path Guided Sampling for Heterogeneous Graph Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A">Ao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruiying Du</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cong Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yebo Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianfeng Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07022v1-abstract-short" style="display: inline;"> The rapid expansion of Internet of Things (IoT) has resulted in vast, heterogeneous graphs that capture complex interactions among devices, sensors, and systems. Efficient analysis of these graphs is critical for deriving insights in IoT scenarios such as smart cities, industrial IoT, and intelligent transportation systems. However, the scale and diversity of IoT-generated data present significant… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07022v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07022v1-abstract-full" style="display: none;"> The rapid expansion of Internet of Things (IoT) has resulted in vast, heterogeneous graphs that capture complex interactions among devices, sensors, and systems. Efficient analysis of these graphs is critical for deriving insights in IoT scenarios such as smart cities, industrial IoT, and intelligent transportation systems. However, the scale and diversity of IoT-generated data present significant challenges, and existing methods often struggle with preserving the structural integrity and semantic richness of these complex graphs. 
Many current approaches fail to maintain the balance between computational efficiency and the quality of the insights generated, leading to potential loss of critical information necessary for accurate decision-making in IoT applications. We introduce HeteroSample, a novel sampling method designed to address these challenges by preserving the structural integrity, node and edge type distributions, and semantic patterns of IoT-related graphs. HeteroSample works by incorporating the novel top-leader selection, balanced neighborhood expansion, and meta-path guided sampling strategies. The key idea is to leverage the inherent heterogeneous structure and semantic relationships encoded by meta-paths to guide the sampling process. This approach ensures that the resulting subgraphs are representative of the original data while significantly reducing computational overhead. Extensive experiments demonstrate that HeteroSample outperforms state-of-the-art methods, achieving up to 15% higher F1 scores in tasks such as link prediction and node classification, while reducing runtime by 20%.These advantages make HeteroSample a transformative tool for scalable and accurate IoT applications, enabling more effective and efficient analysis of complex IoT systems, ultimately driving advancements in smart cities, industrial IoT, and beyond. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07022v1-abstract-full').style.display = 'none'; document.getElementById('2411.07022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06991">arXiv:2411.06991</a> <span> [<a href="https://arxiv.org/pdf/2411.06991">pdf</a>, <a href="https://arxiv.org/format/2411.06991">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SIESEF-FusionNet: Spatial Inter-correlation Enhancement and Spatially-Embedded Feature Fusion Network for LiDAR Point Cloud Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiale Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Fei Xia</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jianliang Mao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoping Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuanlin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06991v1-abstract-short" style="display: inline;"> The ambiguity at the boundaries of different semantic classes in point cloud semantic segmentation often leads to incorrect decisions in intelligent perception systems, such as autonomous driving. 
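A meta-path guided sampler restricts each expansion step to the node type dictated by a path pattern such as device-sensor-device; the toy walk below illustrates that core idea only (the node types and graph are made up, and this is not the HeteroSample algorithm itself).

```python
import random

def meta_path_walk(graph, node_types, start, meta_path, rng=None):
    """One random walk that follows a meta-path over a heterogeneous graph.

    graph:      dict node -> list of neighbour nodes
    node_types: dict node -> type string (e.g. 'device', 'sensor')
    meta_path:  list of type strings, e.g. ['device', 'sensor', 'device']
    Returns the visited nodes, stopping early if no typed neighbour exists.
    """
    rng = rng or random.Random(0)
    assert node_types[start] == meta_path[0]
    walk = [start]
    for wanted in meta_path[1:]:
        candidates = [n for n in graph[walk[-1]] if node_types[n] == wanted]
        if not candidates:
            break
        walk.append(rng.choice(candidates))
    return walk

# Toy heterogeneous graph: two devices linked through a shared sensor.
g = {"d1": ["s1"], "s1": ["d1", "d2"], "d2": ["s1"]}
t = {"d1": "device", "s1": "sensor", "d2": "device"}
print(meta_path_walk(g, t, "d1", ["device", "sensor", "device"]))
```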
arXiv:2411.06991 [pdf, other] cs.CV
SIESEF-FusionNet: Spatial Inter-correlation Enhancement and Spatially-Embedded Feature Fusion Network for LiDAR Point Cloud Semantic Segmentation
Authors: Jiale Chen, Fei Xia, Jianliang Mao, Haoping Wang, Chuanlin Zhang
Abstract: The ambiguity at the boundaries of different semantic classes in point cloud semantic segmentation often leads to incorrect decisions in intelligent perception systems, such as autonomous driving. Hence, accurate delineation of the boundaries is crucial for improving safety in autonomous driving. A novel spatial inter-correlation enhancement and spatially-embedded feature fusion network (SIESEF-FusionNet) is proposed in this paper, enhancing spatial inter-correlation by combining inverse distance weighting and angular compensation to extract more beneficial spatial information without causing redundancy. Meanwhile, a new spatial adaptive pooling module is also designed, embedding enhanced spatial information into semantic features for strengthening the context-awareness of semantic features. Experimental results demonstrate that 83.7% mIoU and 97.8% OA are achieved by SIESEF-FusionNet on the Toronto3D dataset, with performance superior to other baseline methods. A value of 61.1% mIoU is reached on the semanticKITTI dataset, where a marked improvement in segmentation performance is observed. In addition, the effectiveness and plug-and-play capability of the proposed modules are further verified through ablation studies.
Submitted 11 November, 2024; originally announced November 2024.
Comments: 9 pages, 4 figures
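Inverse distance weighting, one of the two ingredients named above, gives each neighbour a weight proportional to 1/distance when aggregating local features; a generic per-point sketch follows (the angular-compensation term is not described in the listing, so it is omitted, and the shapes are assumptions).

```python
import numpy as np

def idw_aggregate(center, neighbor_xyz, neighbor_feats, eps=1e-8):
    """Aggregate neighbour features with inverse-distance weights.

    center:         (3,)   query point coordinates
    neighbor_xyz:   (K, 3) neighbour coordinates
    neighbor_feats: (K, C) neighbour feature vectors
    Returns a (C,) fused feature in which closer neighbours count more.
    """
    dists = np.linalg.norm(neighbor_xyz - center, axis=1)
    weights = 1.0 / (dists + eps)
    weights /= weights.sum()
    return weights @ neighbor_feats
```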
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06895">arXiv:2411.06895</a> <span> [<a href="https://arxiv.org/pdf/2411.06895">pdf</a>, <a href="https://arxiv.org/format/2411.06895">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> DynaShard: Secure and Adaptive Blockchain Sharding Protocol with Hybrid Consensus and Dynamic Shard Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A">Ao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+K">Kun He</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruiying Du</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahua Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cong Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yebo Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianfeng Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06895v1-abstract-short" style="display: inline;"> Blockchain sharding has emerged as a promising solution to the scalability challenges in traditional blockchain systems by partitioning the network into smaller, manageable subsets called shards. Despite its potential, existing sharding solutions face significant limitations in handling dynamic workloads, ensuring secure cross-shard transactions, and maintaining system integrity. To address these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06895v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06895v1-abstract-full" style="display: none;"> Blockchain sharding has emerged as a promising solution to the scalability challenges in traditional blockchain systems by partitioning the network into smaller, manageable subsets called shards. Despite its potential, existing sharding solutions face significant limitations in handling dynamic workloads, ensuring secure cross-shard transactions, and maintaining system integrity. To address these gaps, we propose DynaShard, a dynamic and secure cross-shard transaction processing mechanism designed to enhance blockchain sharding efficiency and security. DynaShard combines adaptive shard management, a hybrid consensus approach, plus an efficient state synchronization and dispute resolution protocol. Our performance evaluation, conducted using a robust experimental setup with real-world network conditions and transaction workloads, demonstrates DynaShard's superior throughput, reduced latency, and improved shard utilization compared to the FTBS method. 
Specifically, DynaShard achieves up to a 42.6% reduction in latency and a 78.77% improvement in shard utilization under high transaction volumes and varying cross-shard transaction ratios. These results highlight DynaShard's ability to outperform state-of-the-art sharding methods, ensuring scalable and resilient blockchain systems. We believe that DynaShard's innovative approach will significantly impact future developments in blockchain technology, paving the way for more efficient and secure distributed systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06895v1-abstract-full').style.display = 'none'; document.getElementById('2411.06895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06638">arXiv:2411.06638</a> <span> [<a href="https://arxiv.org/pdf/2411.06638">pdf</a>, <a href="https://arxiv.org/format/2411.06638">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Model Editing for LLMs4Code: How Far are We? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaopeng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangwen Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shasha Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jun Ma</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jie Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+B">Bin Ji</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weimin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06638v1-abstract-short" style="display: inline;"> Large Language Models for Code (LLMs4Code) have been found to exhibit outstanding performance in the software engineering domain, especially the remarkable performance in coding tasks. However, even the most advanced LLMs4Code can inevitably contain incorrect or outdated code knowledge. 
Due to the high cost of training LLMs4Code, it is impractical to re-train the models for fixing these problemati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06638v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06638v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06638v1-abstract-full" style="display: none;"> Large Language Models for Code (LLMs4Code) have been found to exhibit outstanding performance in the software engineering domain, especially the remarkable performance in coding tasks. However, even the most advanced LLMs4Code can inevitably contain incorrect or outdated code knowledge. Due to the high cost of training LLMs4Code, it is impractical to re-train the models for fixing these problematic code knowledge. Model editing is a new technical field for effectively and efficiently correcting erroneous knowledge in LLMs, where various model editing techniques and benchmarks have been proposed recently. Despite that, a comprehensive study that thoroughly compares and analyzes the performance of the state-of-the-art model editing techniques for adapting the knowledge within LLMs4Code across various code-related tasks is notably absent. To bridge this gap, we perform the first systematic study on applying state-of-the-art model editing approaches to repair the inaccuracy of LLMs4Code. To that end, we introduce a benchmark named CLMEEval, which consists of two datasets, i.e., CoNaLa-Edit (CNLE) with 21K+ code generation samples and CodeSearchNet-Edit (CSNE) with 16K+ code summarization samples. With the help of CLMEEval, we evaluate six advanced model editing techniques on three LLMs4Code: CodeLlama (7B), CodeQwen1.5 (7B), and Stable-Code (3B). Our findings include that the external memorization-based GRACE approach achieves the best knowledge editing effectiveness and specificity (the editing does not influence untargeted knowledge), while generalization (whether the editing can generalize to other semantically-identical inputs) is a universal challenge for existing techniques. Furthermore, building on in-depth case analysis, we introduce an enhanced version of GRACE called A-GRACE, which incorporates contrastive learning to better capture the semantics of the inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06638v1-abstract-full').style.display = 'none'; document.getElementById('2411.06638v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICSE2025. 
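Effectiveness, generalization, and specificity as used above are typically reported as accuracies over three probe sets: the edited prompts themselves, paraphrases of them, and unrelated prompts. The schematic sketch below assumes a hypothetical `predict` callable standing in for whichever inference routine the evaluated model exposes; it is not the CLMEEval harness.

```python
def edit_success_metrics(predict, edits, paraphrases, unrelated):
    """Compute the three standard model-editing scores.

    predict:     callable(prompt) -> answer string (assumed helper)
    edits:       list of (prompt, new_target) pairs that were edited in
    paraphrases: list of (rephrased_prompt, new_target) pairs
    unrelated:   list of (prompt, original_answer) pairs that must not change
    """
    def accuracy(pairs):
        return sum(predict(p) == t for p, t in pairs) / max(len(pairs), 1)

    return {
        "effectiveness": accuracy(edits),         # edit holds on the target prompts
        "generalization": accuracy(paraphrases),  # edit carries over to rephrasings
        "specificity": accuracy(unrelated),       # untargeted knowledge preserved
    }
```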
The code is available at: https://github.com/xpq-tech/code-llmedit.git</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06221">arXiv:2411.06221</a> <span> [<a href="https://arxiv.org/pdf/2411.06221">pdf</a>, <a href="https://arxiv.org/format/2411.06221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Smart-LLaMA: Two-Stage Post-Training of Large Language Models for Smart Contract Vulnerability Detection and Explanation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lei Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhirong Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chenjie Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengjun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Li Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiajia Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06221v1-abstract-short" style="display: inline;"> With the rapid development of blockchain technology, smart contract security has become a critical challenge. Existing smart contract vulnerability detection methods face three main issues: (1) Insufficient quality of datasets, lacking detailed explanations and precise vulnerability locations. (2) Limited adaptability of large language models (LLMs) to the smart contract domain, as most LLMs are p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06221v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06221v1-abstract-full" style="display: none;"> With the rapid development of blockchain technology, smart contract security has become a critical challenge. Existing smart contract vulnerability detection methods face three main issues: (1) Insufficient quality of datasets, lacking detailed explanations and precise vulnerability locations. (2) Limited adaptability of large language models (LLMs) to the smart contract domain, as most LLMs are pre-trained on general text data but minimal smart contract-specific data. (3) Lack of high-quality explanations for detected vulnerabilities, as existing methods focus solely on detection without clear explanations. These limitations hinder detection performance and make it harder for developers to understand and fix vulnerabilities quickly, potentially leading to severe financial losses. 
To address these problems, we propose Smart-LLaMA, an advanced detection method based on the LLaMA language model. First, we construct a comprehensive dataset covering four vulnerability types with labels, detailed explanations, and precise vulnerability locations. Second, we introduce Smart Contract-Specific Continual Pre-Training, using raw smart contract data to enable the LLM to learn smart contract syntax and semantics, enhancing their domain adaptability. Furthermore, we propose Explanation-Guided Fine-Tuning, which fine-tunes the LLM using paired vulnerable code and explanations, enabling both vulnerability detection and reasoned explanations. We evaluate explanation quality through LLM and human evaluation, focusing on Correctness, Completeness, and Conciseness. Experimental results show that Smart-LLaMA outperforms state-of-the-art baselines, with average improvements of 6.49% in F1 score and 3.78% in accuracy, while providing reliable explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06221v1-abstract-full').style.display = 'none'; document.getElementById('2411.06221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05492">arXiv:2411.05492</a> <span> [<a href="https://arxiv.org/pdf/2411.05492">pdf</a>, <a href="https://arxiv.org/ps/2411.05492">ps</a>, <a href="https://arxiv.org/format/2411.05492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Covariance-Based Device Activity Detection with Massive MIMO for Near-Field Correlated Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyue Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Ya-Feng Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Junjie Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05492v1-abstract-short" style="display: inline;"> This paper studies the device activity detection problem in a massive multiple-input multiple-output (MIMO) system for near-field communications (NFC). In this system, active devices transmit their signature sequences to the base station (BS), which detects the active devices based on the received signal. 
In this paper, we model the near-field channels as correlated Rician fading channels and form… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05492v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05492v1-abstract-full" style="display: none;"> This paper studies the device activity detection problem in a massive multiple-input multiple-output (MIMO) system for near-field communications (NFC). In this system, active devices transmit their signature sequences to the base station (BS), which detects the active devices based on the received signal. In this paper, we model the near-field channels as correlated Rician fading channels and formulate the device activity detection problem as a maximum likelihood estimation (MLE) problem. Compared to the traditional uncorrelated channel model, the correlation of channels complicates both algorithm design and theoretical analysis of the MLE problem. On the algorithmic side, we propose two computationally efficient algorithms for solving the MLE problem: an exact coordinate descent (CD) algorithm and an inexact CD algorithm. The exact CD algorithm solves the one-dimensional optimization subproblem exactly using matrix eigenvalue decomposition and polynomial root-finding. By approximating the objective function appropriately, the inexact CD algorithm solves the one-dimensional optimization subproblem inexactly with lower complexity and more robust numerical performance. Additionally, we analyze the detection performance of the MLE problem under correlated channels by comparing it with the case of uncorrelated channels. The analysis shows that when the overall number of devices $N$ is large or the signature sequence length $L$ is small, the detection performance of MLE under correlated channels tends to be better than that under uncorrelated channels. Conversely, when $N$ is small or $L$ is large, MLE performs better under uncorrelated channels than under correlated ones. Simulation results demonstrate the computational efficiency of the proposed algorithms and verify the correctness of the analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05492v1-abstract-full').style.display = 'none'; document.getElementById('2411.05492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
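<p>For readers unfamiliar with covariance-based detection, the sketch below shows the textbook coordinate-descent update for the simpler uncorrelated-channel case in NumPy; the paper's exact and inexact CD algorithms for correlated Rician near-field channels replace this closed-form one-dimensional update with eigenvalue decomposition, polynomial root-finding, or an approximation, none of which is reproduced here. Array shapes and names are illustrative assumptions.</p>
<pre><code class="language-python">import numpy as np

def cd_activity_detection(S, Y, sigma2, num_sweeps=10):
    """Textbook coordinate descent for covariance-based activity detection
    under uncorrelated channels (illustrative only).

    S      : (L, N) complex signature matrix, one column per device
    Y      : (L, M) received signal across M antennas
    sigma2 : noise variance
    Returns nonnegative activity estimates gamma of shape (N,).
    """
    L, N = S.shape
    M = Y.shape[1]
    R = Y @ Y.conj().T / M              # sample covariance of the received signal
    gamma = np.zeros(N)
    Sigma_inv = np.eye(L) / sigma2      # inverse model covariance, noise-only start

    for _ in range(num_sweeps):
        for n in np.random.permutation(N):
            s = S[:, n:n + 1]
            a = Sigma_inv @ s
            sa = (s.conj().T @ a).real.item()            # s^H Sigma^{-1} s
            num = (a.conj().T @ R @ a).real.item() - sa
            d = max(num / sa ** 2, -gamma[n])            # keep gamma_n nonnegative
            gamma[n] += d
            # rank-one (Sherman-Morrison) update after gamma_n changes by d
            Sigma_inv = Sigma_inv - d * (a @ a.conj().T) / (1.0 + d * sa)
    return gamma
</code></pre>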
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 8 figures, submitted for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05383">arXiv:2411.05383</a> <span> [<a href="https://arxiv.org/pdf/2411.05383">pdf</a>, <a href="https://arxiv.org/format/2411.05383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Low-Resource Harmful Meme Detection with LMM Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jianzhao Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyan Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Ziyang Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guang Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05383v1-abstract-short" style="display: inline;"> The proliferation of Internet memes in the age of social media necessitates effective identification of harmful ones. Due to the dynamic nature of memes, existing data-driven models may struggle in low-resource scenarios where only a few labeled examples are available. In this paper, we propose an agency-driven framework for low-resource harmful meme detection, employing both outward and inward an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05383v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05383v1-abstract-full" style="display: none;"> The proliferation of Internet memes in the age of social media necessitates effective identification of harmful ones. Due to the dynamic nature of memes, existing data-driven models may struggle in low-resource scenarios where only a few labeled examples are available. In this paper, we propose an agency-driven framework for low-resource harmful meme detection, employing both outward and inward analysis with few-shot annotated samples. Inspired by the powerful capacity of Large Multimodal Models (LMMs) on multimodal reasoning, we first retrieve relative memes with annotations to leverage label information as auxiliary signals for the LMM agent. Then, we elicit knowledge-revising behavior within the LMM agent to derive well-generalized insights into meme harmfulness. By combining these strategies, our approach enables dialectical reasoning over intricate and implicit harm-indicative patterns. Extensive experiments conducted on three meme datasets demonstrate that our proposed approach achieves superior performance than state-of-the-art methods on the low-resource harmful meme detection task. 
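<p>The retrieve-then-reason recipe described in this abstract can be pictured as follows: embed the query meme, pull the nearest annotated examples as auxiliary label signals, and hand a few-shot prompt to a multimodal model. The embedding step and the <code>lmm_answer</code> call below are hypothetical placeholders rather than the authors' agent, and the knowledge-revising stage is omitted.</p>
<pre><code class="language-python">import numpy as np

def retrieve_support(query_vec, support_vecs, labels, k=4):
    """Return the k annotated memes closest to the query in embedding space."""
    sims = support_vecs @ query_vec / (
        np.linalg.norm(support_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-9)
    top = np.argsort(-sims)[:k]
    return [(int(i), bool(labels[int(i)])) for i in top]

def build_prompt(query_caption, retrieved):
    """Few-shot prompt exposing retrieved labels as auxiliary signals."""
    lines = ["You judge whether an Internet meme is harmful."]
    for idx, harmful in retrieved:
        lines.append(f"Reference meme #{idx} was labelled {'harmful' if harmful else 'harmless'}.")
    lines.append(f"Now analyse this meme: {query_caption}")
    lines.append("Answer 'harmful' or 'harmless', then explain briefly.")
    return "\n".join(lines)

# verdict = lmm_answer(meme_image, build_prompt(caption, retrieve_support(q, S, y)))
#           ^ hypothetical multimodal-model call, not a real API
</code></pre>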
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05383v1-abstract-full').style.display = 'none'; document.getElementById('2411.05383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05123">arXiv:2411.05123</a> <span> [<a href="https://arxiv.org/pdf/2411.05123">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Friction tunable electrostatic clutch with low driving voltage for kinesthetic haptic feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nam%2C+J">Jongseok Nam</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jihyeong Ma</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+N+H">Nak Hyeong Lee</a>, <a href="/search/cs?searchtype=author&query=Kyung%2C+K">Ki-Uk Kyung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05123v1-abstract-short" style="display: inline;"> As interest in Virtual Reality (VR) and Augmented Reality (AR) increases, the demand for kinesthetic haptic feedback devices is rapidly rising. Motor based haptic interfaces are heavy and bulky, leading to discomfort for the user. To address this issue, haptic gloves based on electrostatic clutches that offer fast response times and a thin form factor are being researched. However, high operating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05123v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05123v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05123v1-abstract-full" style="display: none;"> As interest in Virtual Reality (VR) and Augmented Reality (AR) increases, the demand for kinesthetic haptic feedback devices is rapidly rising. Motor based haptic interfaces are heavy and bulky, leading to discomfort for the user. To address this issue, haptic gloves based on electrostatic clutches that offer fast response times and a thin form factor are being researched. However, high operating voltages and variable force control remain challenges to overcome. Electrostatic clutches utilizing functional polymers with charge accumulation properties and dielectric liquid can generate the frictional shear stress over a wide range from 0.35 N/cm$^2$ to 18.9 N/cm$^2$ at low voltages below 100 V. Based on this, the haptic glove generates a high blocking force and is comfortable to wear. 
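<p>For a rough sense of scale, holding force follows $F = \tau A$ from the reported shear stress: assuming a hypothetical $4\,\mathrm{cm}^2$ electrode overlap (the contact area is not stated in this abstract), the clutch would span roughly $0.35 \times 4 = 1.4$ N up to $18.9 \times 4 \approx 75.6$ N, all at drive voltages below 100 V.</p>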
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05123v1-abstract-full').style.display = 'none'; document.getElementById('2411.05123v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of proceedings of 6th International Conference AsiaHaptics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04549">arXiv:2411.04549</a> <span> [<a href="https://arxiv.org/pdf/2411.04549">pdf</a>, <a href="https://arxiv.org/format/2411.04549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Vision Language Models are In-Context Value Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y+J">Yecheng Jason Ma</a>, <a href="/search/cs?searchtype=author&query=Hejna%2C+J">Joey Hejna</a>, <a href="/search/cs?searchtype=author&query=Wahid%2C+A">Ayzaan Wahid</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+C">Chuyuan Fu</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+D">Dhruv Shah</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jacky Liang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhuo Xu</a>, <a href="/search/cs?searchtype=author&query=Kirmani%2C+S">Sean Kirmani</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+P">Peng Xu</a>, <a href="/search/cs?searchtype=author&query=Driess%2C+D">Danny Driess</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Ted Xiao</a>, <a href="/search/cs?searchtype=author&query=Tompson%2C+J">Jonathan Tompson</a>, <a href="/search/cs?searchtype=author&query=Bastani%2C+O">Osbert Bastani</a>, <a href="/search/cs?searchtype=author&query=Jayaraman%2C+D">Dinesh Jayaraman</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tingnan Zhang</a>, <a href="/search/cs?searchtype=author&query=Sadigh%2C+D">Dorsa Sadigh</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Fei Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04549v1-abstract-short" style="display: inline;"> Predicting temporal progress from visual trajectories is important for intelligent robots that can learn, adapt, and improve. However, learning such progress estimator, or temporal value function, across different tasks and domains requires both a large amount of diverse data and methods which can scale and generalize. 
To address these challenges, we present Generative Value Learning (\GVL), a uni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04549v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04549v1-abstract-full" style="display: none;"> Predicting temporal progress from visual trajectories is important for intelligent robots that can learn, adapt, and improve. However, learning such progress estimator, or temporal value function, across different tasks and domains requires both a large amount of diverse data and methods which can scale and generalize. To address these challenges, we present Generative Value Learning (\GVL), a universal value function estimator that leverages the world knowledge embedded in vision-language models (VLMs) to predict task progress. Naively asking a VLM to predict values for a video sequence performs poorly due to the strong temporal correlation between successive frames. Instead, GVL poses value estimation as a temporal ordering problem over shuffled video frames; this seemingly more challenging task encourages VLMs to more fully exploit their underlying semantic and temporal grounding capabilities to differentiate frames based on their perceived task progress, consequently producing significantly better value predictions. Without any robot or task specific training, GVL can in-context zero-shot and few-shot predict effective values for more than 300 distinct real-world tasks across diverse robot platforms, including challenging bimanual manipulation tasks. Furthermore, we demonstrate that GVL permits flexible multi-modal in-context learning via examples from heterogeneous tasks and embodiments, such as human videos. The generality of GVL enables various downstream applications pertinent to visuomotor policy learning, including dataset filtering, success detection, and advantage-weighted regression -- all without any model training or finetuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04549v1-abstract-full').style.display = 'none'; document.getElementById('2411.04549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
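<p>The shuffled-ordering idea at the core of GVL can be sketched in a few lines: shuffle the frames, ask a vision-language model for their chronological order, and convert the recovered ranks into per-frame progress values. The <code>query_vlm_for_order</code> argument below is a hypothetical placeholder for whichever VLM is queried; the rest is plain bookkeeping.</p>
<pre><code class="language-python">import random

def values_from_ordering(frames, query_vlm_for_order):
    """Turn a VLM's chronological ordering of shuffled frames into progress values.

    frames : list of images in true temporal order.
    query_vlm_for_order(shuffled) : hypothetical call returning, for each
        shuffled frame, its predicted rank in time (a permutation of 0..T-1).
    Returns one progress value in [0, 1] per original frame.
    """
    T = len(frames)
    perm = list(range(T))
    random.shuffle(perm)                           # break frame-to-frame correlation
    shuffled = [frames[i] for i in perm]
    predicted_ranks = query_vlm_for_order(shuffled)
    values = [0.0] * T
    for shuffled_pos, rank in enumerate(predicted_ranks):
        values[perm[shuffled_pos]] = rank / (T - 1)   # first frame 0.0, last 1.0
    return values
</code></pre>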
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website and demo: https://generative-value-learning.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04539">arXiv:2411.04539</a> <span> [<a href="https://arxiv.org/pdf/2411.04539">pdf</a>, <a href="https://arxiv.org/format/2411.04539">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Best Practices for Distilling Large Language Models into BERT for Web Search Ranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+D">Dezhi Ye</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junwei Hu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiabin Fan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+B">Bowen Tian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">Haijin Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04539v1-abstract-short" style="display: inline;"> Recent studies have highlighted the significant potential of Large Language Models (LLMs) as zero-shot relevance rankers. These methods predominantly utilize prompt learning to assess the relevance between queries and documents by generating a ranked list of potential documents. Despite their promise, the substantial costs associated with LLMs pose a significant challenge for their direct implemen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04539v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04539v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04539v1-abstract-full" style="display: none;"> Recent studies have highlighted the significant potential of Large Language Models (LLMs) as zero-shot relevance rankers. These methods predominantly utilize prompt learning to assess the relevance between queries and documents by generating a ranked list of potential documents. Despite their promise, the substantial costs associated with LLMs pose a significant challenge for their direct implementation in commercial search systems. To overcome this barrier and fully exploit the capabilities of LLMs for text ranking, we explore techniques to transfer the ranking expertise of LLMs to a more compact model similar to BERT, using a ranking loss to enable the deployment of less resource-intensive models. Specifically, we enhance the training of LLMs through Continued Pre-Training, taking the query as input and the clicked title and summary as output. We then proceed with supervised fine-tuning of the LLM using a rank loss, assigning the final token as a representative of the entire sentence. 
Given the inherent characteristics of autoregressive language models, only the final token </s> can encapsulate all preceding tokens. Additionally, we introduce a hybrid point-wise and margin MSE loss to transfer the ranking knowledge from LLMs to smaller models like BERT. This method creates a viable solution for environments with strict resource constraints. Both offline and online evaluations have confirmed the efficacy of our approach, and our model has been successfully integrated into a commercial web search engine as of February 2024. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04539v1-abstract-full').style.display = 'none'; document.getElementById('2411.04539v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Arxiv Version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03884">arXiv:2411.03884</a> <span> [<a href="https://arxiv.org/pdf/2411.03884">pdf</a>, <a href="https://arxiv.org/format/2411.03884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Polynomial Composition Activations: Unleashing the Dynamics of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuo%2C+Z">Zhijian Zhuo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Ya Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yutao Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoqing Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jinwen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03884v1-abstract-short" style="display: inline;"> Transformers have found extensive applications across various domains due to the powerful fitting capabilities. This success can be partially attributed to their inherent nonlinearity. Thus, in addition to the ReLU function employed in the original transformer architecture, researchers have explored alternative modules such as GeLU and SwishGLU to enhance nonlinearity and thereby augment represent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03884v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03884v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03884v1-abstract-full" style="display: none;"> Transformers have found extensive applications across various domains due to the powerful fitting capabilities. 
This success can be partially attributed to their inherent nonlinearity. Thus, in addition to the ReLU function employed in the original transformer architecture, researchers have explored alternative modules such as GeLU and SwishGLU to enhance nonlinearity and thereby augment representational capacity. In this paper, we propose a novel category of polynomial composition activations (PolyCom), designed to optimize the dynamics of transformers. Theoretically, we provide a comprehensive mathematical analysis of PolyCom, highlighting its enhanced expressivity and efficacy relative to other activation functions. Notably, we demonstrate that networks incorporating PolyCom achieve the $\textbf{optimal approximation rate}$, indicating that PolyCom networks require minimal parameters to approximate general smooth functions in Sobolev spaces. We conduct empirical experiments on the pre-training configurations of large language models (LLMs), including both dense and sparse architectures. By substituting conventional activation functions with PolyCom, we enable LLMs to capture higher-order interactions within the data, thus improving performance metrics in terms of accuracy and convergence rates. Extensive experimental results demonstrate the effectiveness of our method, showing substantial improvements over other activation functions. Code is available at https://github.com/BryceZhuo/PolyCom. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03884v1-abstract-full').style.display = 'none'; document.getElementById('2411.03884v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03471">arXiv:2411.03471</a> <span> [<a href="https://arxiv.org/pdf/2411.03471">pdf</a>, <a href="https://arxiv.org/format/2411.03471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MetRex: A Benchmark for Verilog Code Metric Reasoning Using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abdelatty%2C+M">Manar Abdelatty</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jingxiao Ma</a>, <a href="/search/cs?searchtype=author&query=Reda%2C+S">Sherief Reda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03471v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been applied to various hardware design tasks, including Verilog code generation, EDA tool scripting, and RTL bug fixing. Despite this extensive exploration, LLMs are yet to be used for the task of post-synthesis metric reasoning and estimation of HDL designs. In this paper, we assess the ability of LLMs to reason about post-synthesis metrics of Verilog designs. 
W… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03471v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03471v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have been applied to various hardware design tasks, including Verilog code generation, EDA tool scripting, and RTL bug fixing. Despite this extensive exploration, LLMs are yet to be used for the task of post-synthesis metric reasoning and estimation of HDL designs. In this paper, we assess the ability of LLMs to reason about post-synthesis metrics of Verilog designs. We introduce MetRex, a large-scale dataset comprising 25,868 Verilog HDL designs and their corresponding post-synthesis metrics, namely area, delay, and static power. MetRex incorporates a Chain of Thought (CoT) template to enhance LLMs' reasoning about these metrics. Extensive experiments show that Supervised Fine-Tuning (SFT) boosts the LLM's reasoning capabilities on average by 37.0\%, 25.3\%, and 25.7\% on the area, delay, and static power, respectively. While SFT improves performance on our benchmark, it remains far from achieving optimal results, especially on complex problems. Comparing to state-of-the-art regression models, our approach delivers accurate post-synthesis predictions for 17.4\% more designs (within a 5\% error margin), in addition to offering a 1.7x speedup by eliminating the need for pre-processing. This work lays the groundwork for advancing LLM-based Verilog code metric reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03471v1-abstract-full').style.display = 'none'; document.getElementById('2411.03471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
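<p>To make the Chain-of-Thought framing concrete, here is a minimal sketch of how a CoT-style prompt for post-synthesis metric reasoning over one Verilog design might be assembled; the template wording and reasoning steps are illustrative assumptions, not the MetRex template itself.</p>
<pre><code class="language-python">COT_TEMPLATE = """You are estimating post-synthesis metrics for a Verilog design.

Design:
{verilog}

Think step by step:
1. Identify the major structural blocks (registers, adders, multipliers, muxes).
2. Estimate the cell count and area contribution of each block.
3. Identify the longest combinational path and estimate its delay.
4. Sum static power over the estimated cells.

Finally report: area (um^2), delay (ns), static power (uW).
"""

def build_metric_prompt(verilog_source):
    """Fill the illustrative CoT template with one Verilog design."""
    return COT_TEMPLATE.format(verilog=verilog_source.strip())
</code></pre>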
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. 
We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
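<p>As background for the activated-versus-total parameter distinction above (52 billion activated out of 389 billion total), the sketch below shows a generic top-k expert router in NumPy: only the selected experts run for a given token, which is why the activated count stays far below the total. It is a textbook gate for illustration and does not reproduce Hunyuan-Large's mixed routing strategy, KV-cache compression, or expert-specific learning rates.</p>
<pre><code class="language-python">import numpy as np

def topk_route(token_hidden, gate_weights, k=2):
    """Generic top-k MoE gate (illustration only, not Hunyuan-Large's router).

    token_hidden : (d,) hidden state of one token
    gate_weights : (num_experts, d) router projection
    Returns (expert_ids, mixing_weights); only the selected experts are run.
    """
    logits = gate_weights @ token_hidden
    expert_ids = np.argsort(-logits)[:k]                      # k best experts
    scores = np.exp(logits[expert_ids] - logits[expert_ids].max())
    return expert_ids, scores / scores.sum()
</code></pre>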
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01775">arXiv:2411.01775</a> <span> [<a href="https://arxiv.org/pdf/2411.01775">pdf</a>, <a href="https://arxiv.org/format/2411.01775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Eurekaverse: Environment Curriculum Generation via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+W">William Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sam Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hung-Ju Wang</a>, <a href="/search/cs?searchtype=author&query=Bastani%2C+O">Osbert Bastani</a>, <a href="/search/cs?searchtype=author&query=Jayaraman%2C+D">Dinesh Jayaraman</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y+J">Yecheng Jason Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01775v1-abstract-short" style="display: inline;"> Recent work has demonstrated that a promising strategy for teaching robots a wide range of complex skills is by training them on a curriculum of progressively more challenging environments. However, developing an effective curriculum of environment distributions currently requires significant expertise, which must be repeated for every new domain. Our key insight is that environments are often nat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01775v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01775v1-abstract-full" style="display: none;"> Recent work has demonstrated that a promising strategy for teaching robots a wide range of complex skills is by training them on a curriculum of progressively more challenging environments. However, developing an effective curriculum of environment distributions currently requires significant expertise, which must be repeated for every new domain. Our key insight is that environments are often naturally represented as code. Thus, we probe whether effective environment curriculum design can be achieved and automated via code generation by large language models (LLM). In this paper, we introduce Eurekaverse, an unsupervised environment design algorithm that uses LLMs to sample progressively more challenging, diverse, and learnable environments for skill training. We validate Eurekaverse's effectiveness in the domain of quadrupedal parkour learning, in which a quadruped robot must traverse through a variety of obstacle courses. 
The automatic curriculum designed by Eurekaverse enables gradual learning of complex parkour skills in simulation and can successfully transfer to the real-world, outperforming manual training courses designed by humans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01775v1-abstract-full').style.display = 'none'; document.getElementById('2411.01775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Conference on Robot Learning (CoRL), 2024. Project website and code: https://eureka-research.github.io/eurekaverse</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01218">arXiv:2411.01218</a> <span> [<a href="https://arxiv.org/pdf/2411.01218">pdf</a>, <a href="https://arxiv.org/format/2411.01218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Spatio-Temporal Reconstruction of Dynamic Endoscopic Scenes with 4D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+F">Fengze Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jishuai He</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jieming Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhijing Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01218v1-abstract-short" style="display: inline;"> Dynamic scene reconstruction is essential in robotic minimally invasive surgery, providing crucial spatial information that enhances surgical precision and outcomes. However, existing methods struggle to address the complex, temporally dynamic nature of endoscopic scenes. This paper presents ST-Endo4DGS, a novel framework that models the spatio-temporal volume of dynamic endoscopic scenes using un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01218v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01218v1-abstract-full" style="display: none;"> Dynamic scene reconstruction is essential in robotic minimally invasive surgery, providing crucial spatial information that enhances surgical precision and outcomes. However, existing methods struggle to address the complex, temporally dynamic nature of endoscopic scenes. This paper presents ST-Endo4DGS, a novel framework that models the spatio-temporal volume of dynamic endoscopic scenes using unbiased 4D Gaussian Splatting (4DGS) primitives, parameterized by anisotropic ellipses with flexible 4D rotations. This approach enables precise representation of deformable tissue dynamics, capturing intricate spatial and temporal correlations in real time. 
Additionally, we extend spherindrical harmonics to represent time-evolving appearance, achieving realistic adaptations to lighting and view changes. A new endoscopic normal alignment constraint (ENAC) further enhances geometric fidelity by aligning rendered normals with depth-derived geometry. Extensive evaluations show that ST-Endo4DGS outperforms existing methods in both visual quality and real-time performance, establishing a new state-of-the-art in dynamic scene reconstruction for endoscopic surgery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01218v1-abstract-full').style.display = 'none'; document.getElementById('2411.01218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00476">arXiv:2411.00476</a> <span> [<a href="https://arxiv.org/pdf/2411.00476">pdf</a>, <a href="https://arxiv.org/format/2411.00476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> PlanScope: Learning to Plan Within Decision Scope Does Matter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+R">Ren Xin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00476v1-abstract-short" style="display: inline;"> In the context of autonomous driving, learning-based methods have been promising for the development of planning modules. During the training process of planning modules, directly minimizing the discrepancy between expert-driving logs and planning output is widely deployed. In general, driving logs consist of suddenly appearing obstacles or swiftly changing traffic signals, which typically necessi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00476v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00476v1-abstract-full" style="display: none;"> In the context of autonomous driving, learning-based methods have been promising for the development of planning modules. During the training process of planning modules, directly minimizing the discrepancy between expert-driving logs and planning output is widely deployed. In general, driving logs consist of suddenly appearing obstacles or swiftly changing traffic signals, which typically necessitate swift and nuanced adjustments in driving maneuvers. Concurrently, future trajectories of the vehicles exhibit their long-term decisions, such as adhering to a reference lane or circumventing stationary obstacles. 
Due to the unpredictable influence of future events in driving logs, reasoning bias could be naturally introduced to learning based planning modules, which leads to a possible degradation of driving performance. To address this issue, we identify the decisions and their corresponding time horizons, and characterize a so-called decision scope by retaining decisions within derivable horizons only, to mitigate the effect of irrational behaviors caused by unpredictable events. This framework employs wavelet transformation based log preprocessing with an effective loss computation approach, rendering the planning model only sensitive to valuable decisions at the current state. Since frequency domain characteristics are extracted in conjunction with time domain features by wavelets, decision information across various frequency bands within the corresponding time horizon can be suitably captured. Furthermore, to achieve valuable decision learning, this framework leverages a transformer based decoder that incrementally generates the detailed profiles of future decisions over multiple steps. Our experiments demonstrate that our proposed method outperforms baselines in terms of driving scores with closed-loop evaluations on the nuPlan dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00476v1-abstract-full').style.display = 'none'; document.getElementById('2411.00476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23905">arXiv:2410.23905</a> <span> [<a href="https://arxiv.org/pdf/2410.23905">pdf</a>, <a href="https://arxiv.org/format/2410.23905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-DiFuse: An Interactive Multi-Modal Image Fusion Framework based on Text-modulated Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+L">Lei Cao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiayi Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23905v1-abstract-short" style="display: inline;"> Existing multi-modal image fusion methods fail to address the compound degradations presented in source images, resulting in fusion images plagued by noise, color bias, improper exposure, \textit{etc}. Additionally, these methods often overlook the specificity of foreground objects, weakening the salience of the objects of interest within the fused images. 
To address these challenges, this study p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23905v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23905v1-abstract-full" style="display: none;"> Existing multi-modal image fusion methods fail to address the compound degradations presented in source images, resulting in fusion images plagued by noise, color bias, improper exposure, \textit{etc}. Additionally, these methods often overlook the specificity of foreground objects, weakening the salience of the objects of interest within the fused images. To address these challenges, this study proposes a novel interactive multi-modal image fusion framework based on the text-modulated diffusion model, called Text-DiFuse. First, this framework integrates feature-level information integration into the diffusion process, allowing adaptive degradation removal and multi-modal information fusion. This is the first attempt to deeply and explicitly embed information fusion within the diffusion process, effectively addressing compound degradation in image fusion. Second, by embedding the combination of the text and zero-shot location model into the diffusion fusion process, a text-controlled fusion re-modulation strategy is developed. This enables user-customized text control to improve fusion performance and highlight foreground objects in the fused images. Extensive experiments on diverse public datasets show that our Text-DiFuse achieves state-of-the-art fusion performance across various scenarios with complex degradation. Moreover, the semantic segmentation experiment validates the significant enhancement in semantic performance achieved by our text-controlled fusion re-modulation strategy. The code is publicly available at https://github.com/Leiii-Cao/Text-DiFuse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23905v1-abstract-full').style.display = 'none'; document.getElementById('2410.23905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
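<p>As a purely conceptual illustration of fusing inside the sampling loop, the sketch below runs a generic reverse-diffusion chain in which infrared and visible features are fused into the conditioning signal at every denoising step, with an optional text-derived mask re-weighting the two modalities in located regions. All callables (<code>denoise_step</code>, <code>fuse_features</code>) and the mask handling are assumed placeholders, not Text-DiFuse's actual components.</p>
<pre><code class="language-python">def fusion_in_the_loop(x_T, feats_ir, feats_vis, denoise_step, fuse_features,
                       text_mask=None, num_steps=50):
    """Conceptual reverse-diffusion loop with per-step multi-modal fusion.

    x_T            : starting noise image
    feats_ir/vis   : infrared / visible conditioning features
    denoise_step(x, t, cond) : assumed denoiser returning the step t-1 estimate
    fuse_features(a, b)      : assumed feature-level fusion operator
    text_mask      : optional per-pixel weights from a text-driven locator,
                     used here to emphasise one modality in located regions.
    """
    x = x_T
    for t in reversed(range(num_steps)):
        if text_mask is None:
            cond = fuse_features(feats_ir, feats_vis)
        else:
            m = text_mask[..., None]
            cond = fuse_features(m * feats_ir, (1.0 - m) * feats_vis)
        x = denoise_step(x, t, cond)          # fusion happens inside the chain
    return x
</code></pre>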
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23757">arXiv:2410.23757</a> <span> [<a href="https://arxiv.org/pdf/2410.23757">pdf</a>, <a href="https://arxiv.org/format/2410.23757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Identify Then Recommend: Towards Unsupervised Group Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Shihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jian Ma</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+W">Wenliang Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23757v1-abstract-short" style="display: inline;"> Group Recommendation (GR), which aims to recommend items to groups of users, has become a promising and practical direction for recommendation systems. This paper points out two issues of the state-of-the-art GR models. (1) The pre-defined and fixed number of user groups is inadequate for real-time industrial recommendation systems, where the group distribution can shift dynamically. (2) The train… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23757v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23757v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23757v1-abstract-full" style="display: none;"> Group Recommendation (GR), which aims to recommend items to groups of users, has become a promising and practical direction for recommendation systems. This paper points out two issues of the state-of-the-art GR models. (1) The pre-defined and fixed number of user groups is inadequate for real-time industrial recommendation systems, where the group distribution can shift dynamically. (2) The training schema of existing GR methods is supervised, necessitating expensive user-group and group-item labels, leading to significant annotation costs. To this end, we present a novel unsupervised group recommendation framework named \underline{I}dentify \underline{T}hen \underline{R}ecommend (\underline{ITR}), where it first identifies the user groups in an unsupervised manner even without the pre-defined number of groups, and then two pre-text tasks are designed to conduct self-supervised group recommendation. Concretely, at the group identification stage, we first estimate the adaptive density of each user point, where areas with higher densities are more likely to be recognized as group centers. Then, a heuristic merge-and-split strategy is designed to discover the user groups and decision boundaries. Subsequently, at the self-supervised learning stage, the pull-and-repulsion pre-text task is proposed to optimize the user-group distribution. 
arXiv:2410.23254 (https://arxiv.org/abs/2410.23254) [cs.RO, cs.AI, cs.CV]
Keypoint Abstraction using Large Models for Object-Relative Imitation Learning
Authors: Xiaolin Fang, Bo-Ruei Huang, Jiayuan Mao, Jasmine Shone, Joshua B. Tenenbaum, Tomás Lozano-Pérez, Leslie Pack Kaelbling
Abstract: Generalization to novel object configurations and instances across diverse tasks and environments is a critical challenge in robotics. Keypoint-based representations have been proven effective as a succinct representation for capturing essential object features, and for establishing a reference frame in action prediction, enabling data-efficient learning of robot skills. However, their manual design nature and reliance on additional human labels limit their scalability. In this paper, we propose KALM, a framework that leverages large pre-trained vision-language models (LMs) to automatically generate task-relevant and cross-instance consistent keypoints. KALM distills robust and consistent keypoints across views and objects by generating proposals using LMs and verifies them against a small set of robot demonstration data. Based on the generated keypoints, we can train keypoint-conditioned policy models that predict actions in keypoint-centric frames, enabling robots to generalize effectively across varying object poses, camera views, and object instances with similar functional shapes. Our method demonstrates strong performance in the real world, adapting to different tasks and environments from only a handful of demonstrations while requiring no additional labels. Website: https://kalm-il.github.io/
Submitted 30 October, 2024; originally announced October 2024.
Comments: CoRL LangRob Workshop, 2024
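The propose-then-verify loop described above can be caricatured as: ask a large model for candidate keypoints, then keep only those that can be located consistently in a small set of demonstrations. The sketch below is hypothetical; `propose_keypoints` and `match_in_demo` are stand-ins for the vision-language model and the correspondence check, not functions from the KALM codebase, and the 80% consistency threshold is invented.

```python
from typing import Callable, List, Tuple

Point = Tuple[float, float]

def select_consistent_keypoints(
    propose_keypoints: Callable[[], List[Point]],   # placeholder: VLM proposals on a reference image
    match_in_demo: Callable[[Point, int], bool],    # placeholder: can this keypoint be located in demo i?
    num_demos: int,
    min_hit_rate: float = 0.8,
) -> List[Point]:
    """Keep only keypoint proposals that are found in most demonstrations."""
    kept = []
    for kp in propose_keypoints():
        hits = sum(match_in_demo(kp, i) for i in range(num_demos))
        if hits / num_demos >= min_hit_rate:
            kept.append(kp)
    return kept

# Toy usage with stub callables standing in for the real models.
proposals = lambda: [(0.2, 0.3), (0.8, 0.1), (0.5, 0.5)]
matcher = lambda kp, i: kp != (0.8, 0.1)   # pretend one proposal never matches
print(select_consistent_keypoints(proposals, matcher, num_demos=5))
```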
arXiv:2410.22995 (https://arxiv.org/abs/2410.22995) [cs.CV, cs.AI, cs.CL, cs.LG]
VisAidMath: Benchmarking Visual-Aided Mathematical Reasoning
Authors: Jingkun Ma, Runzhe Zhan, Derek F. Wong, Yang Li, Di Sun, Hou Pong Chan, Lidia S. Chao
Abstract: Although previous research on large language models (LLMs) and large multi-modal models (LMMs) has systematically explored mathematical problem-solving (MPS) within visual contexts, the analysis of how these models process visual information during problem-solving remains insufficient. To address this gap, we present VisAidMath, a benchmark for evaluating the MPS process related to visual information. We follow a rigorous data curation pipeline involving both automated processes and manual annotations to ensure data quality and reliability. Consequently, this benchmark includes 1,200 challenging problems from various mathematical branches, vision-aid formulations, and difficulty levels, collected from diverse sources such as textbooks, examination papers, and Olympiad problems. Based on the proposed benchmark, we conduct comprehensive evaluations on ten mainstream LLMs and LMMs, highlighting deficiencies in the visual-aided reasoning process. For example, GPT-4V only achieves 45.33% accuracy in the visual-aided reasoning task, even with a drop of 2 points when provided with golden visual aids. In-depth analysis reveals that the main cause of deficiencies lies in hallucination regarding the implicit visual reasoning process, shedding light on future research directions in the visual-aided MPS process.
Submitted 30 October, 2024; originally announced October 2024.
Comments: 58 pages, 28 figures

arXiv:2410.22830 (https://arxiv.org/abs/2410.22830) [eess.IV, cs.CV]
Latent Diffusion, Implicit Amplification: Efficient Continuous-Scale Super-Resolution for Remote Sensing Images
Authors: Hanlin Wu, Jiangwei Mo, Xiaohui Sun, Jie Ma
Abstract: Recent advancements in diffusion models have significantly improved performance in super-resolution (SR) tasks. However, previous research often overlooks the fundamental differences between SR and general image generation. General image generation involves creating images from scratch, while SR focuses specifically on enhancing existing low-resolution (LR) images by adding typically missing high-frequency details. This oversight not only increases the training difficulty but also limits their inference efficiency. Furthermore, previous diffusion-based SR methods are typically trained and inferred at fixed integer scale factors, lacking flexibility to meet the needs of up-sampling with non-integer scale factors. To address these issues, this paper proposes an efficient and elastic diffusion-based SR model (E$^2$DiffSR), specially designed for continuous-scale SR in remote sensing imagery. E$^2$DiffSR employs a two-stage latent diffusion paradigm. During the first stage, an autoencoder is trained to capture the differential priors between high-resolution (HR) and LR images.
The encoder intentionally ignores the existing LR content to alleviate the encoding burden, while the decoder introduces an SR branch equipped with a continuous scale upsampling module to accomplish the reconstruction under the guidance of the differential prior. In the second stage, a conditional diffusion model is learned within the latent space to predict the true differential prior encoding. Experimental results demonstrate that E$^2$DiffSR achieves superior objective metrics and visual quality compared to the state-of-the-art SR methods. Additionally, it reduces the inference time of diffusion-based SR methods to a level comparable to that of non-diffusion methods.
Submitted 30 October, 2024; originally announced October 2024.

arXiv:2410.21359 (https://arxiv.org/abs/2410.21359) [cs.CL, cs.AI, cs.CY, cs.LG, econ.GN]
Can Machines Think Like Humans? A Behavioral Evaluation of LLM-Agents in Dictator Games
Authors: Ji Ma
Abstract: As Large Language Model (LLM)-based agents increasingly undertake real-world tasks and engage with human society, how well do we understand their behaviors? This study (1) investigates how LLM agents' prosocial behaviors -- a fundamental social norm -- can be induced by different personas and benchmarked against human behaviors; and (2) introduces a behavioral approach to evaluate the performance of LLM agents in complex decision-making scenarios. We explored how different personas and experimental framings affect these AI agents' altruistic behavior in dictator games and compared their behaviors within the same LLM family, across various families, and with human behaviors. Our findings reveal substantial variations and inconsistencies among LLMs and notable differences compared to human behaviors. Merely assigning a human-like identity to LLMs does not produce human-like behaviors. Despite being trained on extensive human-generated data, these AI agents cannot accurately predict human decisions. LLM agents are not able to capture the internal processes of human decision-making, and their alignment with human behavior is highly variable and dependent on specific model architectures and prompt formulations; even worse, such dependence does not follow a clear pattern.
Submitted 28 October, 2024; originally announced October 2024.
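A behavioural evaluation of this kind reduces, at its core, to prompting a model with a persona and an endowment and parsing how much it gives away. The harness below is a generic illustration rather than the paper's protocol: `query_llm` is a placeholder for whatever chat API is under test, and the prompt wording, endowment, and parsing rule are all invented for the example.

```python
import re
from statistics import mean
from typing import Callable, Optional

def dictator_trial(query_llm: Callable[[str], str], persona: str, endowment: int = 100) -> Optional[float]:
    """Run one dictator game and return the fraction given away, or None if the reply is unparsable."""
    prompt = (
        f"{persona} You have {endowment} dollars and are paired with an anonymous stranger. "
        "How many dollars do you give to the stranger? Answer with a single number."
    )
    reply = query_llm(prompt)
    match = re.search(r"\d+(\.\d+)?", reply)
    if not match:
        return None
    given = min(max(float(match.group()), 0.0), float(endowment))
    return given / endowment

# Toy usage with a stub "model" that always gives 30 out of 100.
stub = lambda prompt: "I would give 30 dollars."
shares = [dictator_trial(stub, "You are an ordinary adult.") for _ in range(10)]
print(mean(s for s in shares if s is not None))   # -> 0.3
```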
arXiv:2410.20786 (https://arxiv.org/abs/2410.20786) [cs.LG, cs.RO]
Adversarial Constrained Policy Optimization: Improving Constrained Reinforcement Learning by Adapting Budgets
Authors: Jianmina Ma, Jingtian Ji, Yue Gao
Abstract: Constrained reinforcement learning has achieved promising progress in safety-critical fields where both rewards and constraints are considered. However, constrained reinforcement learning methods face challenges in striking the right balance between task performance and constraint satisfaction, and they are prone to getting stuck in over-conservative or constraint-violating local minima.
In this paper, we propose Adversarial Constrained Policy Optimization (ACPO), which enables simultaneous optimization of reward and the adaptation of cost budgets during training. Our approach divides the original constrained problem into two adversarial stages that are solved alternately, and the policy update performance of our algorithm can be theoretically guaranteed. We validate our method through experiments conducted on Safety Gymnasium and quadruped locomotion tasks. Results demonstrate that our algorithm achieves better performance compared to commonly used baselines.
Submitted 28 October, 2024; originally announced October 2024.
Comments: 21 pages, 8 figures
MSC Class: 68T01; ACM Class: I.2.6
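The alternating two-stage structure described in the abstract can be pictured as a loop that interleaves a constrained policy update with an adjustment of the cost budget. The skeleton below is only schematic: `update_policy` and `evaluate_cost` are placeholders for a full constrained RL step and a rollout evaluation, and the budget-adaptation rule is a simple invented heuristic, not ACPO's actual adversarial update.

```python
from typing import Callable

def alternating_budget_training(
    update_policy: Callable[[float], None],   # placeholder: one constrained policy-update step under a given budget
    evaluate_cost: Callable[[], float],       # placeholder: expected episode cost of the current policy
    cost_limit: float,
    init_budget: float,
    iters: int = 100,
    step: float = 0.05,
) -> float:
    """Alternate between improving the policy under the current budget and adjusting the budget itself."""
    budget = init_budget
    for _ in range(iters):
        update_policy(budget)                 # stage 1: optimize reward subject to the current cost budget
        cost = evaluate_cost()
        # stage 2 (invented heuristic): relax the budget while the policy is under the limit, tighten it when it violates
        budget = max(0.0, min(budget + step * (cost_limit - cost), cost_limit))
    return budget

# Toy usage with stubs: a "policy" whose cost drifts halfway toward whatever budget it is given.
state = {"cost": 2.0}
update = lambda b: state.update(cost=0.5 * state["cost"] + 0.5 * b)
print(alternating_budget_training(update, lambda: state["cost"], cost_limit=1.0, init_budget=2.0))
```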
arXiv:2410.20688 (https://arxiv.org/abs/2410.20688) [cs.LG, q-bio.BM]
Reprogramming Pretrained Target-Specific Diffusion Models for Dual-Target Drug Design
Authors: Xiangxin Zhou, Jiaqi Guan, Yijia Zhang, Xingang Peng, Liang Wang, Jianzhu Ma
Abstract: Dual-target therapeutic strategies have become a compelling approach and attracted significant attention due to various benefits, such as their potential in overcoming drug resistance in cancer therapy. Considering the tremendous success that deep generative models have achieved in structure-based drug design in recent years, we formulate dual-target drug design as a generative task and curate a novel dataset of potential target pairs based on synergistic drug combinations. We propose to design dual-target drugs with diffusion models that are trained on single-target protein-ligand complex pairs. Specifically, we align two pockets in 3D space with protein-ligand binding priors and build two complex graphs with shared ligand nodes for SE(3)-equivariant composed message passing, based on which we derive a composed drift in both 3D and categorical probability space in the generative process. Our algorithm can well transfer the knowledge gained in single-target pretraining to dual-target scenarios in a zero-shot manner. We also repurpose linker design methods as strong baselines for this task. Extensive experiments demonstrate the effectiveness of our method compared with various baselines.
Submitted 26 November, 2024; v1 submitted 27 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
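The "composed drift" can be read as combining the denoising signals that the two single-target models produce for the shared ligand. The toy below simply averages two per-atom drift estimates on the ligand's 3D coordinates; treating the composition as an average is an assumption made for illustration, and the categorical (atom-type) branch and SE(3)-equivariant message passing are not reproduced here.

```python
import numpy as np

def composed_drift(drift_pocket_a: np.ndarray, drift_pocket_b: np.ndarray) -> np.ndarray:
    """Naively compose two per-atom drift estimates for the shared ligand by averaging them.
    Both inputs have shape (n_atoms, 3): one drift per ligand atom from each single-target model."""
    assert drift_pocket_a.shape == drift_pocket_b.shape
    return 0.5 * (drift_pocket_a + drift_pocket_b)

# Toy usage: pretend each single-target model produced a drift for a 5-atom ligand.
rng = np.random.default_rng(1)
drift = composed_drift(rng.normal(size=(5, 3)), rng.normal(size=(5, 3)))
print(drift.shape)
```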
arXiv:2410.20230 (https://arxiv.org/abs/2410.20230) [cs.RO]
FRTree Planner: Robot Navigation in Cluttered and Unknown Environments with Tree of Free Regions
Authors: Yulin Li, Zhicheng Song, Chunxin Zheng, Zhihai Bi, Kai Chen, Michael Yu Wang, Jun Ma
Abstract: In this work, we present FRTree planner, a novel robot navigation framework that leverages a tree structure of free regions, specifically designed for navigation in cluttered and unknown environments with narrow passages. The framework continuously incorporates real-time perceptive information to identify distinct navigation options and dynamically expands the tree toward explorable and traversable directions. This dynamically constructed tree incrementally encodes the geometric and topological information of the collision-free space, enabling efficient selection of the intermediate goals, navigating around dead-end situations, and avoidance of dynamic obstacles without a prior map. Crucially, our method performs a comprehensive analysis of the geometric relationship between free regions and the robot during online replanning. In particular, the planner assesses the accessibility of candidate passages based on the robot's geometries, facilitating the effective selection of the most viable intermediate goals through accessible narrow passages while minimizing unnecessary detours. By combining the free region information with a bi-level trajectory optimization tailored for robots with specific geometries, our approach generates robust and adaptable obstacle avoidance strategies in confined spaces. Through extensive simulations and real-world experiments, FRTree demonstrates its superiority over benchmark methods in generating safe, efficient motion plans through highly cluttered and unknown terrains with narrow gaps.
Submitted 26 October, 2024; originally announced October 2024.
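To make the "tree of free regions" concrete, here is a bare-bones node structure and expansion step: each node stores a free region, and newly perceived regions are attached to the first existing node they connect to. Everything here is a simplification invented for illustration (circular regions, an overlap test as the connectivity check, a stub `perceive_free_regions`); the real planner additionally scores passages against the robot geometry and runs a bi-level trajectory optimization.

```python
from dataclasses import dataclass, field
from typing import Callable, List, Optional, Tuple

Region = Tuple[float, float, float]   # toy free region: (x, y, radius)

@dataclass
class FRNode:
    region: Region
    parent: Optional["FRNode"] = None
    children: List["FRNode"] = field(default_factory=list)

def regions_connect(a: Region, b: Region) -> bool:
    """Two circular free regions are considered connected if they overlap."""
    (ax, ay, ar), (bx, by, br) = a, b
    return (ax - bx) ** 2 + (ay - by) ** 2 <= (ar + br) ** 2

def expand_tree(root: FRNode, perceive_free_regions: Callable[[], List[Region]]) -> None:
    """Attach each newly perceived free region to the first existing node it connects to."""
    nodes, frontier = [], [root]
    while frontier:
        node = frontier.pop()
        nodes.append(node)
        frontier.extend(node.children)
    for region in perceive_free_regions():
        for node in nodes:
            if regions_connect(node.region, region):
                node.children.append(FRNode(region, parent=node))
                break

# Toy usage: one perception update adds two regions, only one of which is reachable from the start region.
root = FRNode((0.0, 0.0, 1.0))
expand_tree(root, lambda: [(1.5, 0.0, 1.0), (10.0, 10.0, 1.0)])
print(len(root.children))   # -> 1 (the far region does not connect)
```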
arXiv:2410.20109 (https://arxiv.org/abs/2410.20109) [cs.CV, cs.AI, cs.MM]
GiVE: Guiding Visual Encoder to Perceive Overlooked Information
Authors: Junjie Li, Jianghong Ma, Xiaofeng Zhang, Yuhang Li, Jianyang Shi
Abstract: Multimodal Large Language Models have advanced AI in applications like text-to-video generation and visual question answering. These models rely on visual encoders to convert non-text data into vectors, but current encoders either lack semantic alignment or overlook non-salient objects. We propose the Guiding Visual Encoder to Perceive Overlooked Information (GiVE) approach. GiVE enhances visual representation with an Attention-Guided Adapter (AG-Adapter) module and an Object-focused Visual Semantic Learning module. These incorporate three novel loss terms: Object-focused Image-Text Contrast (OITC) loss, Object-focused Image-Image Contrast (OIIC) loss, and Object-focused Image Discrimination (OID) loss, improving object consideration, retrieval accuracy, and comprehensiveness. Our contributions include dynamic visual focus adjustment, novel loss functions to enhance object retrieval, and the Multi-Object Instruction (MOInst) dataset. Experiments show our approach achieves state-of-the-art performance.
Submitted 26 October, 2024; originally announced October 2024.
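The abstract names three object-focused loss terms but not their exact forms. One plausible reading, used purely for illustration, is that the two contrast losses are InfoNCE-style terms and the discrimination loss is a classification term, combined as a weighted sum; the snippet below encodes that reading with made-up weights and is not the GiVE objective.

```python
import torch
import torch.nn.functional as F

def info_nce(queries: torch.Tensor, keys: torch.Tensor, temperature: float = 0.07) -> torch.Tensor:
    """Standard InfoNCE: the i-th query should match the i-th key among all keys in the batch."""
    logits = F.normalize(queries, dim=-1) @ F.normalize(keys, dim=-1).T / temperature
    targets = torch.arange(queries.size(0), device=queries.device)
    return F.cross_entropy(logits, targets)

def give_style_loss(img_obj, txt_obj, img_obj_aug, obj_logits, obj_labels,
                    w_oitc=1.0, w_oiic=1.0, w_oid=1.0) -> torch.Tensor:
    """Weighted sum of an image-text, an image-image, and an object-discrimination term (assumed forms)."""
    l_oitc = info_nce(img_obj, txt_obj)               # object-focused image-text contrast
    l_oiic = info_nce(img_obj, img_obj_aug)           # object-focused image-image contrast
    l_oid = F.cross_entropy(obj_logits, obj_labels)   # object-focused discrimination
    return w_oitc * l_oitc + w_oiic * l_oiic + w_oid * l_oid

# Toy usage with random features for a batch of 8 and 5 object classes.
b, d, c = 8, 32, 5
loss = give_style_loss(torch.randn(b, d), torch.randn(b, d), torch.randn(b, d),
                       torch.randn(b, c), torch.randint(0, c, (b,)))
print(float(loss))
```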
arXiv:2410.20057 (https://arxiv.org/abs/2410.20057) [cs.LG, math.PR, stat.ML]
Mechanism learning: Reverse causal inference in the presence of multiple unknown confounding through front-door causal bootstrapping
Authors: Jianqiao Mao, Max A. Little
Abstract: A major limitation of machine learning (ML) prediction models is that they recover associational, rather than causal, predictive relationships between variables. In high-stakes automation applications of ML this is problematic, as the model often learns spurious, non-causal associations.
This paper proposes mechanism learning, a simple method which uses front-door causal bootstrapping to deconfound observational data such that any appropriate ML model is forced to learn predictive relationships between effects and their causes (reverse causal inference), despite the potential presence of multiple unknown and unmeasured confounding. Effect variables can be very high dimensional, and the predictive relationship nonlinear, as is common in ML applications. This novel method is widely applicable; the only requirement is the existence of a mechanism variable mediating the cause (prediction target) and effect (feature data), which is independent of the (unmeasured) confounding variables. We test our method on fully synthetic, semi-synthetic and real-world datasets, demonstrating that it can discover reliable, unbiased, causal ML predictors where, by contrast, the same ML predictor trained naively using classical supervised learning on the original observational data is heavily biased by spurious associations. We provide code to implement the results in the paper, online.
Submitted 25 October, 2024; originally announced October 2024.
Comments: 12 pages, 6 figures
ACM Class: I.2.4; G.3

arXiv:2410.19989 (https://arxiv.org/abs/2410.19989) [cs.RO, cs.LG]
On-Robot Reinforcement Learning with Goal-Contrastive Rewards
Authors: Ondrej Biza, Thomas Weng, Lingfeng Sun, Karl Schmeckpeper, Tarik Kelestemur, Yecheng Jason Ma, Robert Platt, Jan-Willem van de Meent, Lawson L. S. Wong
Abstract: Reinforcement Learning (RL) has the potential to enable robots to learn from their own actions in the real world. Unfortunately, RL can be prohibitively expensive, in terms of on-robot runtime, due to inefficient exploration when learning from a sparse reward signal. Designing dense reward functions is labour-intensive and requires domain expertise. In our work, we propose GCR (Goal-Contrastive Rewards), a dense reward function learning method that can be trained on passive video demonstrations. By using videos without actions, our method is easier to scale, as we can use arbitrary videos. GCR combines two loss functions, an implicit value loss function that models how the reward increases when traversing a successful trajectory, and a goal-contrastive loss that discriminates between successful and failed trajectories. We perform experiments in simulated manipulation environments across RoboMimic and MimicGen tasks, as well as in the real world using a Franka arm and a Spot quadruped. We find that GCR leads to more sample-efficient RL, enabling model-free RL to solve about twice as many tasks as our baseline reward learning methods. We also demonstrate positive cross-embodiment transfer from videos of people and of other robots performing a task. Appendix: https://tinyurl.com/gcr-appendix-2
Submitted 25 October, 2024; originally announced October 2024.
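A rough rendering of the two losses named in this abstract: a margin term that pushes a learned reward to increase along the frames of a successful video, and a binary term that separates successful from failed trajectories. The functional forms, the margin value, and the `reward_net` interface are assumptions made for illustration, not the GCR objective.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def gcr_style_losses(reward_net: nn.Module,
                     success_frames: torch.Tensor,   # (T, d) features of one successful trajectory, in time order
                     success_final: torch.Tensor,    # (B, d) final-frame features of successful trajectories
                     failed_final: torch.Tensor,     # (B, d) final-frame features of failed trajectories
                     margin: float = 0.1):
    """Return (implicit value loss, goal-contrastive loss) under assumed functional forms."""
    r = reward_net(success_frames).squeeze(-1)   # (T,)
    # Implicit value loss: consecutive rewards along a successful trajectory should increase by at least `margin`.
    value_loss = F.relu(margin - (r[1:] - r[:-1])).mean()
    # Goal-contrastive loss: final states of successes should score high, failures low.
    logits = torch.cat([reward_net(success_final), reward_net(failed_final)]).squeeze(-1)
    labels = torch.cat([torch.ones(success_final.size(0)), torch.zeros(failed_final.size(0))])
    contrastive_loss = F.binary_cross_entropy_with_logits(logits, labels)
    return value_loss, contrastive_loss

# Toy usage with a linear reward model over 16-dimensional frame features.
net = nn.Linear(16, 1)
v, c = gcr_style_losses(net, torch.randn(20, 16), torch.randn(4, 16), torch.randn(4, 16))
print(float(v), float(c))
```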
arXiv:2410.19978 (https://arxiv.org/abs/2410.19978) [cs.LG]
Global Graph Counterfactual Explanation: A Subgraph Mapping Approach
Authors: Yinhan He, Wendy Zheng, Yaochen Zhu, Jing Ma, Saumitra Mishra, Natraj Raman, Ninghao Liu, Jundong Li
Abstract: Graph Neural Networks (GNNs) have been widely deployed in various real-world applications. However, most GNNs are black-box models that lack explanations. One strategy to explain GNNs is through counterfactual explanation, which aims to find minimum perturbations on input graphs that change the GNN predictions.
Existing works on GNN counterfactual explanations primarily concentrate on the local-level perspective (i.e., generating counterfactuals for each individual graph), which suffers from information overload and lacks insights into the broader cross-graph relationships. To address such issues, we propose GlobalGCE, a novel global-level graph counterfactual explanation method. GlobalGCE aims to identify a collection of subgraph mapping rules as counterfactual explanations for the target GNN. According to these rules, substituting certain significant subgraphs with their counterfactual subgraphs will change the GNN prediction to the desired class for most graphs (i.e., maximum coverage). Methodologically, we design a significant subgraph generator and a counterfactual subgraph autoencoder in our GlobalGCE, where the subgraphs and the rules can be effectively generated. Extensive experiments demonstrate the superiority of our GlobalGCE compared to existing baselines. Our code can be found at https://anonymous.4open.science/r/GlobalGCE-92E8.
Submitted 25 October, 2024; originally announced October 2024.