Search | arXiv e-print repository

Showing 1&ndash;50 of 2,071 results for author: Zhang, Q

Searching in archive cs. (Search in all archives: /search/?searchtype=author&query=Zhang%2C+Q)
type="text" value="Zhang, Q"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+Q&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, Q"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Q&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01821">arXiv:2412.01821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01821">pdf</a>, <a href="https://arxiv.org/format/2412.01821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> World-consistent Video Diffusion with Explicit 3D Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qihang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+S">Shuangfei Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Bautista%2C+M+A">Miguel Angel Bautista</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+K">Kevin Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Toshev%2C+A">Alexander Toshev</a>, <a href="/search/cs?searchtype=author&amp;query=Susskind%2C+J">Joshua Susskind</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiatao Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01821v1-abstract-short" style="display: inline;"> Recent advancements in diffusion models have set new benchmarks in image and video generation, enabling realistic visual synthesis across single- and multi-frame contexts. However, these models still struggle with efficiently and explicitly generating 3D-consistent content. 
To address this, we propose World-consistent Video Diffusion (WVD), a novel framework that incorporates explicit 3D supervisi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01821v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01821v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01821v1-abstract-full" style="display: none;"> Recent advancements in diffusion models have set new benchmarks in image and video generation, enabling realistic visual synthesis across single- and multi-frame contexts. However, these models still struggle with efficiently and explicitly generating 3D-consistent content. To address this, we propose World-consistent Video Diffusion (WVD), a novel framework that incorporates explicit 3D supervision using XYZ images, which encode global 3D coordinates for each image pixel. More specifically, we train a diffusion transformer to learn the joint distribution of RGB and XYZ frames. This approach supports multi-task adaptability via a flexible inpainting strategy. For example, WVD can estimate XYZ frames from ground-truth RGB or generate novel RGB frames using XYZ projections along a specified camera trajectory. In doing so, WVD unifies tasks like single-image-to-3D generation, multi-view stereo, and camera-controlled video generation. Our approach demonstrates competitive performance across multiple benchmarks, providing a scalable solution for 3D-consistent video and image generation with a single pretrained model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01821v1-abstract-full').style.display = 'none'; document.getElementById('2412.01821v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
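The XYZ image named in this abstract is concrete enough to sketch: each pixel stores a global 3D coordinate, which is what you get by unprojecting a depth map through the camera intrinsics and pose. A minimal sketch follows; the function name and the depth/pose inputs are illustrative assumptions, not the paper's released code.

```python
import numpy as np

def xyz_image(depth, K, cam_to_world):
    """Unproject a depth map into an "XYZ image": an (H, W, 3) array whose
    pixels hold global 3D coordinates, the supervision signal WVD pairs
    with RGB frames. Assumes z-depth and a 4x4 camera-to-world pose."""
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3).T  # 3 x HW
    rays = np.linalg.inv(K) @ pix                # camera-space ray directions
    pts_cam = rays * depth.reshape(1, -1)        # scale each ray by its depth
    pts_cam = np.vstack([pts_cam, np.ones((1, pts_cam.shape[1]))])
    pts_world = (cam_to_world @ pts_cam)[:3]     # move into world coordinates
    return pts_world.T.reshape(H, W, 3)
```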
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01818">arXiv:2412.01818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01818">pdf</a>, <a href="https://arxiv.org/format/2412.01818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> [CLS] Attention is All You Need for Training-Free Visual Token Pruning: Make VLM Inference Faster </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qizhe Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+A">Aosong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuo%2C+Z">Zhiyong Zhuo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Minqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiajun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+S">Shaobo Guo</a>, <a href="/search/cs?searchtype=author&amp;query=She%2C+Q">Qi She</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01818v1-abstract-short" style="display: inline;"> Large vision-language models (VLMs) often rely on a substantial number of visual tokens when interacting with large language models (LLMs), which has proven to be inefficient. Recent efforts have aimed to accelerate VLM inference by pruning visual tokens. Most existing methods assess the importance of visual tokens based on the text-visual cross-attentions in LLMs. In this study, we find that the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01818v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01818v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01818v1-abstract-full" style="display: none;"> Large vision-language models (VLMs) often rely on a substantial number of visual tokens when interacting with large language models (LLMs), which has proven to be inefficient. Recent efforts have aimed to accelerate VLM inference by pruning visual tokens. Most existing methods assess the importance of visual tokens based on the text-visual cross-attentions in LLMs. In this study, we find that the cross-attentions between text and visual tokens in LLMs are inaccurate. Pruning tokens based on these inaccurate attentions leads to significant performance degradation, especially at high reduction ratios. To this end, we introduce FasterVLM, a simple yet effective training-free visual token pruning method that evaluates the importance of visual tokens more accurately by utilizing attentions between the [CLS] token and image tokens from the visual encoder. 
Since FasterVLM eliminates redundant visual tokens immediately after the visual encoder, ensuring they do not interact with LLMs and resulting in faster VLM inference. It is worth noting that, benefiting from the accuracy of [CLS] cross-attentions, FasterVLM can prune 95\% of visual tokens while maintaining 90\% of the performance of LLaVA-1.5-7B. We apply FasterVLM to various VLMs, including LLaVA-1.5, LLaVA-NeXT, and Video-LLaVA, to demonstrate its effectiveness. Experimental results show that our FasterVLM maintains strong performance across various VLM architectures and reduction ratios, significantly outperforming existing text-visual attention-based methods. Our code is available at https://github.com/Theia-4869/FasterVLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01818v1-abstract-full').style.display = 'none'; document.getElementById('2412.01818v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 11 figures, code: https://github.com/Theia-4869/FasterVLM, project page: https://theia-4869.github.io/FasterVLM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00715">arXiv:2412.00715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00715">pdf</a>, <a href="https://arxiv.org/format/2412.00715">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Semi-Supervised Approach with Error Reflection for Echocardiography Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaoxiang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yiman Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+J">Jiang Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qingli Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiangang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+M">Menghan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuqi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00715v1-abstract-short" style="display: inline;"> Segmenting internal structure from echocardiography is essential for the diagnosis and treatment of various heart diseases. Semi-supervised learning shows its ability in alleviating annotations scarcity. 
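The pruning rule this abstract describes (rank patch tokens by the visual encoder's [CLS] attention, keep the top fraction, and do it before the LLM ever sees them) fits in a few lines. A hedged sketch, not the repository's code; the tensor shapes and keep-ratio default are assumptions:

```python
import torch

def prune_visual_tokens(visual_tokens, cls_attn, keep_ratio=0.05):
    """Training-free pruning in the spirit of FasterVLM: rank image tokens
    by the visual encoder's [CLS]->patch attention and keep the top-k,
    before any token reaches the LLM.

    visual_tokens: (B, N, D) patch embeddings from the vision encoder
    cls_attn:      (B, N) attention weights from [CLS] to each patch
    """
    B, N, D = visual_tokens.shape
    k = max(1, int(N * keep_ratio))
    idx = cls_attn.topk(k, dim=1).indices.sort(dim=1).values  # keep spatial order
    return visual_tokens.gather(1, idx.unsqueeze(-1).expand(B, k, D))
```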
3. arXiv:2412.00715 [pdf, other] eess.IV, cs.CV
   A Semi-Supervised Approach with Error Reflection for Echocardiography Segmentation
   Authors: Xiaoxiang Han, Yiman Liu, Jiang Shang, Qingli Li, Jiangang Chen, Menghan Hu, Qi Zhang, Yuqi Zhang, Yan Wang
   Abstract: Segmenting internal structure from echocardiography is essential for the diagnosis and treatment of various heart diseases, and semi-supervised learning shows its ability to alleviate annotation scarcity. While existing semi-supervised methods have been successful in image segmentation across various medical imaging modalities, few have attempted to design methods that specifically address the challenges posed by the poor contrast, blurred edge details, and noise of echocardiography. These characteristics make it hard to generate high-quality pseudo-labels in semi-supervised segmentation based on Mean Teacher. Inspired by human reflection on erroneous practices, we devise an error reflection strategy for an echocardiography semi-supervised segmentation architecture. The process triggers the model to reflect on inaccuracies in unlabeled image segmentation, thereby enhancing the robustness of pseudo-label generation. Specifically, the strategy is divided into two steps. The first step is reconstruction reflection: the network is tasked with reconstructing authentic proxy images from the semantic masks of unlabeled images and their auxiliary sketches, while maximizing the structural similarity between the original inputs and the proxies. The second step is guidance correction: reconstruction error maps decouple unreliable segmentation regions, and reliable data that are more likely to occur near high-density areas are leveraged to guide the optimization of unreliable data potentially located around decision boundaries. Additionally, we introduce an effective data augmentation strategy, termed the multi-scale mixing-up strategy, to minimize the empirical distribution gap between labeled and unlabeled images and to perceive cardiac anatomical structures at diverse scales. Extensive experiments demonstrate the competitiveness of the proposed method.
   Submitted 1 December, 2024; originally announced December 2024.
   Comments: 6 pages, 4 figures, accepted by 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM 2024)
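The "multi-scale mixing-up" augmentation is only named in the abstract, so the sketch below is one plausible reading: plain mixup applied inside random crops at several scales, blending a labeled image with an unlabeled one. Every detail here (crop sampling, the Beta mixing weight) is an assumption rather than the paper's definition:

```python
import numpy as np

def multiscale_mixup(labeled, unlabeled, scales=(0.25, 0.5, 1.0), alpha=0.5):
    """Mix a labeled and an unlabeled image (same shape, float arrays)
    inside random crops at several scales; an assumed reading of the
    paper's multi-scale mixing-up strategy."""
    H, W = labeled.shape[:2]
    mixed = []
    for s in scales:
        h, w = int(H * s), int(W * s)
        top = np.random.randint(0, H - h + 1)
        left = np.random.randint(0, W - w + 1)
        lam = np.random.beta(alpha, alpha)        # mixup coefficient
        out = labeled.copy()
        out[top:top+h, left:left+w] = (
            lam * labeled[top:top+h, left:left+w]
            + (1 - lam) * unlabeled[top:top+h, left:left+w]
        )
        mixed.append(out)
    return mixed
```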
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figure, accepted by 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00621">arXiv:2412.00621</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00621">pdf</a>, <a href="https://arxiv.org/format/2412.00621">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Exposing LLM Vulnerabilities: Adversarial Scam Detection and Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+C">Chen-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Shailik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+S">Shutonu Mitra</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Salemi%2C+H">Hossein Salemi</a>, <a href="/search/cs?searchtype=author&amp;query=Purohit%2C+H">Hemant Purohit</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fengxiu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+M">Michin Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+J">Jin-Hee Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chang-Tien Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00621v1-abstract-short" style="display: inline;"> Can we trust Large Language Models (LLMs) to accurately predict scam? This paper investigates the vulnerabilities of LLMs when facing adversarial scam messages for the task of scam detection. We addressed this issue by creating a comprehensive dataset with fine-grained labels of scam messages, including both original and adversarial scam messages. The dataset extended traditional binary classes fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00621v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00621v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00621v1-abstract-full" style="display: none;"> Can we trust Large Language Models (LLMs) to accurately predict scam? This paper investigates the vulnerabilities of LLMs when facing adversarial scam messages for the task of scam detection. We addressed this issue by creating a comprehensive dataset with fine-grained labels of scam messages, including both original and adversarial scam messages. The dataset extended traditional binary classes for the scam detection task into more nuanced scam types. Our analysis showed how adversarial examples took advantage of vulnerabilities of a LLM, leading to high misclassification rate. 
We evaluated the performance of LLMs on these adversarial scam messages and proposed strategies to improve their robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00621v1-abstract-full').style.display = 'none'; document.getElementById('2412.00621v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2024 IEEE International Conference on Big Data workshop BigEACPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18875">arXiv:2411.18875</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18875">pdf</a>, <a href="https://arxiv.org/format/2411.18875">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Know Your Account: Double Graph Inference-based Account De-anonymization on Ethereum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+S">Shuyi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+W">Wangjie Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hongwei Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qinnan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+X">Xiaofan Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xunan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jin Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zhiming Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18875v1-abstract-short" style="display: inline;"> The scaled Web 3.0 digital economy, represented by decentralized finance (DeFi), has sparked increasing interest in the past few years, which usually relies on blockchain for token transfer and diverse transaction logic. However, illegal behaviors, such as financial fraud, hacker attacks, and money laundering, are rampant in the blockchain ecosystem and seriously threaten its integrity and securit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18875v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18875v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18875v1-abstract-full" style="display: none;"> The scaled Web 3.0 digital economy, represented by decentralized finance (DeFi), has sparked increasing interest in the past few years, which usually relies on blockchain for token transfer and diverse transaction logic. However, illegal behaviors, such as financial fraud, hacker attacks, and money laundering, are rampant in the blockchain ecosystem and seriously threaten its integrity and security. 
5. arXiv:2411.18875 [pdf, other] cs.SI
   Know Your Account: Double Graph Inference-based Account De-anonymization on Ethereum
   Authors: Shuyi Miao, Wangjie Qiu, Hongwei Zheng, Qinnan Zhang, Xiaofan Tu, Xunan Liu, Yang Liu, Jin Dong, Zhiming Zheng
   Abstract: The scaled Web 3.0 digital economy, represented by decentralized finance (DeFi), has sparked increasing interest in the past few years; it usually relies on blockchain for token transfer and diverse transaction logic. However, illegal behaviors such as financial fraud, hacker attacks, and money laundering are rampant in the blockchain ecosystem and seriously threaten its integrity and security. In this paper, we propose a novel double graph-based Ethereum account de-anonymization inference method, dubbed DBG4ETH, which aims to capture the behavioral patterns of accounts comprehensively and to provide more robust analysis and judgment of today's complex and continuously generated transaction behaviors. Specifically, we first construct a global static graph to model the complex interactions among account nodes over all transaction data. We then construct a local dynamic graph to learn the gradual evolution of transactions over different periods. The two graphs capture information from complementary perspectives, so features of global and local, static and dynamic transaction graphs are all available through DBG4ETH. In addition, we propose an adaptive confidence calibration method that predicts the results by feeding the calibrated, weighted prediction values into the classifier. Experimental results show that DBG4ETH achieves state-of-the-art results in the account identification task, improving the F1-score by at least 3.75% and up to 40.52% compared to processing each graph type individually, and outperforming similar account identity inference methods by 5.23% to 12.91%.
   Submitted 27 November, 2024; originally announced November 2024.
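The adaptive confidence calibration step, as summarized, feeds calibrated and confidence-weighted predictions from the two graph views into a final classifier. The sketch below conveys the general shape with temperature scaling and max-probability weights; the exact calibration and weighting rules are assumptions, not the paper's:

```python
import torch
import torch.nn.functional as F

def fuse_predictions(logits_static, logits_dynamic, T_static, T_dynamic):
    """Fuse global-static and local-dynamic graph predictions with
    temperature-calibrated confidence weights (an illustrative stand-in
    for DBG4ETH's adaptive confidence calibration, not its exact rule)."""
    p_s = F.softmax(logits_static / T_static, dim=-1)
    p_d = F.softmax(logits_dynamic / T_dynamic, dim=-1)
    w_s = p_s.max(dim=-1, keepdim=True).values   # confidence of each view
    w_d = p_d.max(dim=-1, keepdim=True).values
    return (w_s * p_s + w_d * p_d) / (w_s + w_d)
```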
6. arXiv:2411.18615 [pdf, other] cs.LG, cs.AI, cs.CV
   Proactive Gradient Conflict Mitigation in Multi-Task Learning: A Sparse Training Perspective
   Authors: Zhi Zhang, Jiayi Shen, Congfeng Cao, Gaole Dai, Shiji Zhou, Qizhe Zhang, Shanghang Zhang, Ekaterina Shutova
   Abstract: Advancing towards generalist agents necessitates the concurrent processing of multiple tasks using a unified model, underscoring the growing significance of training a single model on multiple downstream tasks simultaneously. A common issue in multi-task learning is gradient conflict, which leads to potential competition among different tasks during joint training. This competition often results in improvements in one task at the expense of deterioration in another. Although several optimization methods have been developed to address this issue by manipulating task gradients for better task balancing, they cannot decrease the incidence of gradient conflict. In this paper, we systematically investigate the occurrence of gradient conflict across different methods and propose a strategy to reduce such conflicts through sparse training (ST), wherein only a portion of the model's parameters are updated during training while the rest are kept unchanged. Our extensive experiments demonstrate that ST effectively mitigates conflicting gradients and leads to superior performance. Furthermore, ST can be easily integrated with gradient manipulation techniques, enhancing their effectiveness.
   Submitted 27 November, 2024; originally announced November 2024.
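Sparse training is defined directly in this abstract: update only a chosen subset of parameters and keep the rest frozen. In practice that amounts to masking gradients before the optimizer step, as in this sketch (how the mask is chosen is the paper's actual subject and is left abstract here):

```python
import torch

def sparse_training_step(model, loss, optimizer, masks):
    """One multi-task step where only masked parameters are updated (ST).
    masks: dict mapping parameter name -> 0/1 tensor of the same shape."""
    optimizer.zero_grad()
    loss.backward()                        # e.g. a sum of per-task losses
    with torch.no_grad():
        for name, p in model.named_parameters():
            if p.grad is not None and name in masks:
                p.grad.mul_(masks[name])   # zero the gradient of frozen entries
    optimizer.step()
```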
7. arXiv:2411.18279 [pdf, other] cs.AI, cs.CL, cs.HC
   Large Language Model-Brained GUI Agents: A Survey
   Authors: Chaoyun Zhang, Shilin He, Jiaxu Qian, Bowen Li, Liqun Li, Si Qin, Yu Kang, Minghua Ma, Guyue Liu, Qingwei Lin, Saravan Rajmohan, Dongmei Zhang, Qi Zhang
   Abstract: GUIs have long been central to human-computer interaction, providing an intuitive and visually-driven way to access and interact with digital systems. The advent of LLMs, particularly multimodal models, has ushered in a new era of GUI automation. They have demonstrated exceptional capabilities in natural language understanding, code generation, and visual processing. This has paved the way for a new generation of LLM-brained GUI agents capable of interpreting complex GUI elements and autonomously executing actions based on natural language instructions. These agents represent a paradigm shift, enabling users to perform intricate, multi-step tasks through simple conversational commands. Their applications span web navigation, mobile app interactions, and desktop automation, offering a transformative user experience that revolutionizes how individuals interact with software. This emerging field is rapidly advancing, with significant progress in both research and industry. To provide a structured understanding of this trend, this paper presents a comprehensive survey of LLM-brained GUI agents, exploring their historical evolution, core components, and advanced techniques. We address research questions such as existing GUI agent frameworks, the collection and utilization of data for training specialized GUI agents, the development of large action models tailored for GUI tasks, and the evaluation metrics and benchmarks necessary to assess their effectiveness. Additionally, we examine emerging applications powered by these agents. Through a detailed analysis, this survey identifies key research gaps and outlines a roadmap for future advancements in the field. By consolidating foundational knowledge and state-of-the-art developments, this work aims to guide both researchers and practitioners in overcoming challenges and unlocking the full potential of LLM-brained GUI agents.
   Submitted 28 November, 2024; v1 submitted 27 November, 2024; originally announced November 2024.
   Comments: The collection of papers reviewed in this survey will be hosted and regularly updated on the GitHub repository https://github.com/vyokky/LLM-Brained-GUI-Agents-Survey. Additionally, a searchable webpage is available at https://aka.ms/gui-agent for easier access and exploration.
8. arXiv:2411.18263 [pdf, other] cs.CV
   TSD-SR: One-Step Diffusion with Target Score Distillation for Real-World Image Super-Resolution
   Authors: Linwei Dong, Qingnan Fan, Yihong Guo, Zhonghao Wang, Qi Zhang, Jinwei Chen, Yawei Luo, Changqing Zou
   Abstract: Pre-trained text-to-image diffusion models are increasingly applied to the real-world image super-resolution (Real-ISR) task. Given the iterative refinement nature of diffusion models, most existing approaches are computationally expensive. While methods such as SinSR and OSEDiff have emerged to condense inference steps via distillation, their performance in image restoration and detail recovery is not satisfactory. To address this, we propose TSD-SR, a novel distillation framework specifically designed for real-world image super-resolution, aiming to construct an efficient and effective one-step model. We first introduce Target Score Distillation, which leverages the priors of diffusion models and real image references to achieve more realistic image restoration. Second, we propose a Distribution-Aware Sampling Module to make detail-oriented gradients more readily accessible, addressing the challenge of recovering fine details. Extensive experiments demonstrate that TSD-SR achieves superior restoration results (most metrics perform best) and the fastest inference speed (e.g., 40 times faster than SeeSR) compared to past Real-ISR approaches based on pre-trained diffusion priors.
   Submitted 27 November, 2024; originally announced November 2024.
9. arXiv:2411.17240 [pdf, other] cs.CV
   Boost 3D Reconstruction using Diffusion-based Monocular Camera Calibration
   Authors: Junyuan Deng, Wei Yin, Xiaoyang Guo, Qian Zhang, Xiaotao Hu, Weiqiang Ren, Xiaoxiao Long, Ping Tan
   Abstract: In this paper, we present DM-Calib, a diffusion-based approach for estimating pinhole camera intrinsic parameters from a single input image. Monocular camera calibration is essential for many 3D vision tasks. However, most existing methods depend on handcrafted assumptions or are constrained by limited training data, resulting in poor generalization across diverse real-world images. Recent advancements in stable diffusion models, trained on massive data, have shown the ability to generate high-quality images with varied characteristics. Emerging evidence indicates that these models implicitly capture the relationship between camera focal length and image content. Building on this insight, we explore how to leverage the powerful priors of diffusion models for monocular pinhole camera calibration. Specifically, we introduce a new image-based representation, termed Camera Image, which losslessly encodes the numerical camera intrinsics and integrates seamlessly with the diffusion framework. Using this representation, we reformulate the problem of estimating camera intrinsics as the generation of a dense Camera Image conditioned on an input image. By fine-tuning a stable diffusion model to generate a Camera Image from a single RGB input, we can extract camera intrinsics via a RANSAC operation. We further demonstrate that our monocular calibration method enhances performance across various 3D tasks, including zero-shot metric depth estimation, 3D metrology, pose estimation, and sparse-view reconstruction. Extensive experiments on multiple public datasets show that our approach significantly outperforms baselines and provides broad benefits to 3D vision tasks. Code is available at https://github.com/JunyuanDeng/DM-Calib.
   Submitted 26 November, 2024; originally announced November 2024.
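The Camera Image idea can be made concrete: intrinsics are encoded losslessly as a dense per-pixel map, and recovering them from a generated map reduces to a robust fit, which is where the RANSAC operation mentioned above comes in. The particular encoding below (unit ray directions) and the plain least-squares recovery are illustrative assumptions, not the paper's exact construction:

```python
import numpy as np

def to_camera_image(K, H, W):
    """Encode pinhole intrinsics as a dense 3-channel "Camera Image" of
    unit ray directions (one assumed realization of the paper's idea)."""
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    rays = np.stack([(u - K[0, 2]) / K[0, 0],
                     (v - K[1, 2]) / K[1, 1],
                     np.ones_like(u, dtype=float)], axis=-1)
    return rays / np.linalg.norm(rays, axis=-1, keepdims=True)

def recover_intrinsics(cam_img):
    """Invert the encoding: un-normalize the rays back to the z=1 plane,
    then fit fx, cx and fy, cy by least squares (a full pipeline would
    wrap this fit in RANSAC to reject noisy generated pixels)."""
    H, W, _ = cam_img.shape
    rays = cam_img / cam_img[..., 2:3]            # back to z = 1
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    x, y = rays[..., 0].ravel(), rays[..., 1].ravel()
    # u = fx * x + cx  and  v = fy * y + cy
    fx, cx = np.linalg.lstsq(np.stack([x, np.ones_like(x)], axis=1),
                             u.ravel().astype(float), rcond=None)[0]
    fy, cy = np.linalg.lstsq(np.stack([y, np.ones_like(y)], axis=1),
                             v.ravel().astype(float), rcond=None)[0]
    return fx, fy, cx, cy
```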
10. arXiv:2411.16807 [pdf, other] physics.ao-ph, cs.AI, cs.LG
    ADAF: An Artificial Intelligence Data Assimilation Framework for Weather Forecasting
    Authors: Yanfei Xiang, Weixin Jin, Haiyu Dong, Mingliang Bai, Zuliang Fang, Pengcheng Zhao, Hongyu Sun, Kit Thambiratnam, Qi Zhang, Xiaomeng Huang
    Abstract: The forecasting skill of numerical weather prediction (NWP) models critically depends on accurate initial conditions, also known as analysis, provided by data assimilation (DA). Traditional DA methods often face a trade-off between computational cost and accuracy due to complex linear algebra computations and the high dimensionality of the model, especially in nonlinear systems. Moreover, processing massive data in real time requires substantial computational resources. To address this, we introduce an artificial intelligence-based data assimilation framework (ADAF) to generate high-quality kilometer-scale analysis. This study is pioneering work that uses real-world observations from varied locations and multiple sources, including sparse surface weather observations and satellite imagery, to verify the AI method's efficacy in DA. We implemented ADAF for four near-surface variables in the Contiguous United States (CONUS). The results indicate that ADAF surpasses the High Resolution Rapid Refresh Data Assimilation System (HRRRDAS) in accuracy by 16% to 33% for near-surface atmospheric conditions, aligning more closely with actual observations, and can effectively reconstruct extreme events, such as tropical cyclone wind fields. Sensitivity experiments reveal that ADAF can generate high-quality analysis even with low-accuracy backgrounds and extremely sparse surface observations. ADAF can assimilate massive observations within a three-hour window at low computational cost, taking about two seconds on an AMD MI200 graphics processing unit (GPU). ADAF has been shown to be efficient and effective in real-world DA, underscoring its potential role in operational weather forecasting.
    Submitted 25 November, 2024; originally announced November 2024.
    Comments: 29 pages, 15 figures
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16579">arXiv:2411.16579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16579">pdf</a>, <a href="https://arxiv.org/format/2411.16579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing LLM Reasoning via Critique Models with Test-Time and Training-Time Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dingwen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jixuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jiafu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guanyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yiwen Ding</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+B">Boyang Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+S">Shihan Do</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wenyu Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+R">Rui Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+T">Tao Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiaowei Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+Y">Yitao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Weng%2C+R">Rongxiang Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xunliang Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16579v1-abstract-short" style="display: inline;"> Training large language models (LLMs) to spend more time thinking and reflection before responding is crucial for effectively solving complex reasoning tasks in fields such as science, coding, and mathematics. 
<span class="abstract-full has-text-grey-dark mathjax" id="2411.16579v1-abstract-full"> Training large language models (LLMs) to spend more time thinking and reflecting before responding is crucial for effectively solving complex reasoning tasks in fields such as science, coding, and mathematics. However, the effectiveness of mechanisms like self-reflection and self-correction depends on the model&#39;s capacity to accurately assess its own performance, which can be limited by factors such as initial accuracy, question difficulty, and the lack of external feedback. In this paper, we delve into a two-player paradigm that separates the roles of reasoning and critique models, where the critique model provides step-level feedback to supervise the reasoning (actor) model during both test time and training time. We first propose AutoMathCritique, an automated and scalable framework for collecting critique data, resulting in a dataset of 76,321 responses paired with step-level feedback. Fine-tuning language models with this dataset enables them to generate natural language feedback for mathematical reasoning. We demonstrate that the critique models consistently improve the actor&#39;s performance on difficult queries at test time, especially when scaling up inference-time computation. Motivated by these findings, we introduce critique-based supervision into the actor&#39;s self-training process and propose a critique-in-the-loop self-improvement method. Experiments show that the method improves the actor&#39;s exploration efficiency and solution diversity, especially on challenging queries, leading to a stronger reasoning model. Lastly, we take a preliminary step toward training self-talk reasoning models via critique supervision and showcase their potential. Our code and datasets are at \href{https://mathcritique.github.io/}{https://mathcritique.github.io/}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
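<p class="is-size-7">The test-time side of the two-player setup can be pictured as critique-guided re-ranking. A hedged sketch (the <code>actor_generate</code> and <code>critic_score_steps</code> callables are hypothetical stand-ins, not the paper&#39;s API):</p>
<pre><code class="language-python">import random

def best_of_n(question, actor_generate, critic_score_steps, n=8):
    """Sample n reasoning traces from the actor; keep the one the critic rates best."""
    candidates = [actor_generate(question) for _ in range(n)]
    def trace_score(trace):
        step_scores = critic_score_steps(question, trace)  # one score per reasoning step
        return min(step_scores)       # a chain is only as strong as its weakest step
    return max(candidates, key=trace_score)

# dummy stand-ins so the sketch runs end to end
answer = best_of_n(
    "2+2?",
    actor_generate=lambda q: [f"step {i}" for i in range(3)],
    critic_score_steps=lambda q, trace: [random.random() for _ in trace],
)
</code></pre>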
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15702">arXiv:2411.15702</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15702">pdf</a>, <a href="https://arxiv.org/format/2411.15702">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Editable-DeepSC: Reliable Cross-Modal Semantic Communications for Facial Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenbo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qinshan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15702v1-abstract-short" style="display: inline;"> Real-time computer vision (CV) plays a crucial role in various real-world applications, whose performance is highly dependent on communication networks. Nonetheless, the data-oriented characteristics of conventional communications often do not align with the special needs of real-time CV tasks. To alleviate this issue, the recently emerged semantic communications only transmit task-related semanti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15702v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15702v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15702v1-abstract-full" style="display: none;"> Real-time computer vision (CV) plays a crucial role in various real-world applications, whose performance is highly dependent on communication networks. Nonetheless, the data-oriented characteristics of conventional communications often do not align with the special needs of real-time CV tasks. To alleviate this issue, the recently emerged semantic communications only transmit task-related semantic information and exhibit a promising landscape to address this problem. However, the communication challenges associated with Semantic Facial Editing, one of the most important real-time CV applications on social media, still remain largely unexplored. In this paper, we fill this gap by proposing Editable-DeepSC, a novel cross-modal semantic communication approach for facial editing. Firstly, we theoretically discuss different transmission schemes that separately handle communications and editings, and emphasize the necessity of Joint Editing-Channel Coding (JECC) via iterative attributes matching, which integrates editings into the communication chain to preserve more semantic mutual information. 
To compactly represent the high-dimensional data, we leverage inversion methods via pre-trained StyleGAN priors for semantic coding. To tackle dynamic channel noise conditions, we propose SNR-aware channel coding via model fine-tuning. Extensive experiments indicate that Editable-DeepSC can achieve superior editing results while significantly saving transmission bandwidth, even under high-resolution and out-of-distribution (OOD) settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15438">arXiv:2411.15438</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15438">pdf</a>, <a href="https://arxiv.org/format/2411.15438">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficient Ternary Weight Embedding Model: Bridging Scalability and Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiayi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shaoqun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Nan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liangjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.15438v1-abstract-full"> Embedding models have become essential tools in both natural language processing and computer vision, enabling efficient semantic search, recommendation, clustering, and more. However, the high memory and computational demands of full-precision embeddings pose challenges for deployment in resource-constrained environments, such as real-time recommendation systems. In this work, we propose a novel fine-tuning framework for ternary-weight embedding models, which reduces memory and computational overhead while maintaining high performance.
To apply ternarization to pre-trained embedding models, we introduce self-taught knowledge distillation to finalize the ternary weights of the linear layers. Extensive experiments on public text and vision datasets demonstrate that, without sacrificing effectiveness, the ternarized model consumes far less memory and achieves low inference latency. In practical implementations, embedding models are typically integrated with Approximate Nearest Neighbor (ANN) search. Our experiments combining ternary embedding with ANN search yielded impressive improvements in both accuracy and computational efficiency. The repository is available here. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li>
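<p class="is-size-7">Ternarization itself is easy to picture: each linear layer&#39;s weights are snapped to three levels, {-alpha, 0, +alpha}. A generic sketch (the threshold rule and per-tensor scale are illustrative; the paper&#39;s self-taught distillation recipe is not reproduced here):</p>
<pre><code class="language-python">import numpy as np

def ternarize(W, sparsity=0.5):
    """Quantize weights to {-alpha, 0, +alpha}, zeroing the smallest-magnitude entries."""
    thresh = np.quantile(np.abs(W), sparsity)            # magnitude cut-off
    Q = np.sign(W) * (np.abs(W) > thresh)                # entries in {-1, 0, +1}
    alpha = np.abs(W[Q != 0]).mean()                     # per-tensor scale
    return alpha * Q

W = np.random.randn(256, 64)                             # a pre-trained linear layer
W_t = ternarize(W)
print(np.unique(W_t).size)                               # only 3 distinct values survive
</code></pre>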
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15139">arXiv:2411.15139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15139">pdf</a>, <a href="https://arxiv.org/format/2411.15139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> DiffusionDrive: Truncated Diffusion Model for End-to-End Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liao%2C+B">Bencheng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+H">Haoran Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Sixu Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinbang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiangyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.15139v1-abstract-full"> Recently, the diffusion model has emerged as a powerful generative technique for robotic policy learning, capable of modeling multi-mode action distributions. Leveraging its capability for end-to-end autonomous driving is a promising direction. However, the numerous denoising steps in the robotic diffusion policy and the more dynamic, open-world nature of traffic scenes pose substantial challenges for generating diverse driving actions at a real-time speed. To address these challenges, we propose a novel truncated diffusion policy that incorporates prior multi-mode anchors and truncates the diffusion schedule, enabling the model to learn denoising from an anchored Gaussian distribution to the multi-mode driving action distribution. Additionally, we design an efficient cascade diffusion decoder for enhanced interaction with conditional scene context. The proposed model, DiffusionDrive, demonstrates a 10$\times$ reduction in denoising steps compared to the vanilla diffusion policy, delivering superior diversity and quality in just 2 steps. On the planning-oriented NAVSIM dataset, with the aligned ResNet-34 backbone, DiffusionDrive achieves 88.1 PDMS without bells and whistles, setting a new record, while running at a real-time speed of 45 FPS on an NVIDIA 4090. Qualitative results on challenging scenarios further confirm that DiffusionDrive can robustly generate diverse plausible driving actions. Code and model will be available at https://github.com/hustvl/DiffusionDrive. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
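<p class="is-size-7">A toy illustration of the truncated-schedule idea (our sketch, not DiffusionDrive itself): instead of denoising from pure noise over many steps, sampling starts near a multi-mode anchor and runs only the last few steps. The score function below is a placeholder.</p>
<pre><code class="language-python">import numpy as np

def truncated_sample(anchor, score_fn, n_steps=2, step=0.5, noise=0.1):
    """Start near a multi-mode anchor instead of pure noise; denoise for only n_steps."""
    x = anchor + noise * np.random.randn(*anchor.shape)   # anchored Gaussian, not N(0, I)
    for t in range(n_steps, 0, -1):
        x = x + step * score_fn(x, t)                     # move along the learned score
    return x

anchors = [np.array([1.0, 0.0]), np.array([-1.0, 0.0])]   # prior multi-mode anchors
score_fn = lambda x, t: -0.1 * x                          # placeholder score function
action = truncated_sample(anchors[0], score_fn)
</code></pre>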
Code &amp; demo &amp; model will be available at https://github.com/hustvl/DiffusionDrive</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14500">arXiv:2411.14500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14500">pdf</a>, <a href="https://arxiv.org/format/2411.14500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring Accuracy-Fairness Trade-off in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingquan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Q">Qiqi Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+B">Bo Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yuhui Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jialin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14500v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have made significant strides in the field of artificial intelligence, showcasing their ability to interact with humans and influence human cognition through information dissemination. However, recent studies have brought to light instances of bias inherent within these LLMs, presenting a critical issue that demands attention. In our research, we delve deeper into the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14500v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14500v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14500v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have made significant strides in the field of artificial intelligence, showcasing their ability to interact with humans and influence human cognition through information dissemination. However, recent studies have brought to light instances of bias inherent within these LLMs, presenting a critical issue that demands attention. In our research, we delve deeper into the intricate challenge of harmonising accuracy and fairness in the enhancement of LLMs. While improving accuracy can indeed enhance overall LLM performance, it often occurs at the expense of fairness. Overemphasising optimisation of one metric invariably leads to a significant degradation of the other. This underscores the necessity of taking into account multiple considerations during the design and optimisation phases of LLMs. Therefore, we advocate for reformulating the LLM training process as a multi-objective learning task. Our investigation reveals that multi-objective evolutionary learning (MOEL) methodologies offer promising avenues for tackling this challenge. 
Our MOEL framework enables the simultaneous optimisation of both accuracy and fairness metrics, resulting in a Pareto-optimal set of LLMs. In summary, our study sheds valuable light on the delicate equilibrium between accuracy and fairness within LLMs, which is increasingly significant for their real-world applications. By harnessing MOEL, we present a promising pathway towards fairer and more efficacious AI technologies. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li>
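<p class="is-size-7">The multi-objective framing can be visualised with a toy sweep (our illustration; MOEL would use evolutionary search rather than this scalarization, and the two objectives below are synthetic stand-ins): varying the trade-off weight traces out a set of non-dominated models rather than a single optimum.</p>
<pre><code class="language-python">import numpy as np

def evaluate(theta):
    acc = 1.0 - (theta - 0.8) ** 2     # toy accuracy objective, peaked at theta = 0.8
    fair = 1.0 - (theta - 0.2) ** 2    # toy fairness objective, peaked at theta = 0.2
    return acc, fair

thetas = np.linspace(0, 1, 101)
pareto = []
for w in np.linspace(0, 1, 11):        # weight on accuracy vs. fairness
    scores = [w * evaluate(t)[0] + (1 - w) * evaluate(t)[1] for t in thetas]
    pareto.append(float(thetas[int(np.argmax(scores))]))
print(sorted(set(np.round(pareto, 2))))   # a spread of non-dominated trade-off models
</code></pre>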
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11741">arXiv:2411.11741</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11741">pdf</a>, <a href="https://arxiv.org/format/2411.11741">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> A Bicriterion Concentration Inequality and Prophet Inequalities for $k$-Fold Matroid Unions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alon%2C+N">Noga Alon</a>, <a href="/search/cs?searchtype=author&amp;query=Gravin%2C+N">Nick Gravin</a>, <a href="/search/cs?searchtype=author&amp;query=Pollner%2C+T">Tristan Pollner</a>, <a href="/search/cs?searchtype=author&amp;query=Rubinstein%2C+A">Aviad Rubinstein</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Weinberg%2C+S+M">S. Matthew Weinberg</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qianfan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.11741v2-abstract-full"> We investigate prophet inequalities with competitive ratios approaching $1$, seeking to generalize $k$-uniform matroids. We first show that large girth does not suffice: for all $k$, there exists a matroid of girth $\geq k$ and a prophet inequality instance on that matroid whose optimal competitive ratio is $\frac{1}{2}$. Next, we show $k$-fold matroid unions do suffice: we provide a prophet inequality with competitive ratio $1-O(\sqrt{\frac{\log k}{k}})$ for any $k$-fold matroid union. Our prophet inequality follows from an online contention resolution scheme. The key technical ingredient in our online contention resolution scheme is a novel bicriterion concentration inequality for arbitrary monotone $1$-Lipschitz functions over independent items, which may be of independent interest. Applied to our particular setting, our bicriterion concentration inequality yields &#34;Chernoff-strength&#34; concentration for a $1$-Lipschitz function that is not (approximately) self-bounding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ITCS 2025</span> </p> </li>
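<p class="is-size-7">For readers new to the setting, the classic single-item prophet inequality grounds what &#34;competitive ratio&#34; means here; the paper&#39;s online contention resolution scheme for $k$-fold matroid unions is far more involved. A Monte Carlo sketch of the median-threshold (Samuel-Cahn) rule, which guarantees ratio $\frac{1}{2}$:</p>
<pre><code class="language-python">import random

def median_threshold_ratio(dists, trials=20000):
    """Accept the first value exceeding the median of the maximum."""
    maxima = sorted(max(d() for d in dists) for _ in range(trials))
    T = maxima[trials // 2]                  # median of the prophet's value
    gain = opt = 0.0
    for _ in range(trials):
        vals = [d() for d in dists]
        opt += max(vals)                     # the prophet takes the maximum
        gain += next((v for v in vals if v >= T), 0.0)
    return gain / opt                        # guaranteed at least 1/2 in expectation

dists = [random.random for _ in range(5)]    # five i.i.d. U[0, 1] items
print(median_threshold_ratio(dists))         # empirical competitive ratio
</code></pre>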
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11225">arXiv:2411.11225</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11225">pdf</a>, <a href="https://arxiv.org/format/2411.11225">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Online Item Cold-Start Recommendation with Popularity-Aware Meta-Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yunze Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yuezihan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yinjie Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Gaode Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingchi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bian%2C+K">Kaigui Bian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peiyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.11225v2-abstract-full"> With the rise of e-commerce and short videos, online recommender systems that can capture users&#39; interests and update new items in real time play an increasingly important role. In both online and offline recommendation, the cold-start problem due to interaction sparsity degrades recommendation quality for cold-start items; this is also known as the long-tail problem of item distribution. Many cold-start schemes based on fine-tuning or knowledge transfer show excellent performance in offline recommendation. Yet, these schemes are infeasible for online recommendation on streaming data pipelines due to their different training methods, computational overhead, and time constraints. Motivated by these issues, we propose a model-agnostic recommendation algorithm called Popularity-Aware Meta-learning (PAM) to address the item cold-start problem under streaming data settings. PAM divides the incoming data into different meta-learning tasks by predefined item popularity thresholds. The model can distinguish and reweight behavior-related and content-related features in each task based on their different roles at different popularity levels, thus adapting to recommendations for cold-start samples. This task-fixing design significantly reduces additional computation and storage costs compared to offline methods. Furthermore, PAM introduces data augmentation and an additional self-supervised loss specifically designed for low-popularity tasks, leveraging insights from high-popularity samples. This approach effectively mitigates the issue of inadequate supervision due to the scarcity of cold-start samples. Experimental results across multiple public datasets demonstrate the superiority of our approach over other baseline methods in addressing cold-start challenges in online streaming data scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
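<p class="is-size-7">The popularity-threshold task split is straightforward to sketch (threshold values below are illustrative, not PAM&#39;s):</p>
<pre><code class="language-python">def assign_task(item_interactions, thresholds=(10, 100)):
    """Route an item to a meta-learning task by its popularity bucket."""
    for task_id, t in enumerate(thresholds):
        if t > item_interactions:
            return task_id        # low buckets lean on content features + augmentation
    return len(thresholds)        # high bucket: behavior features dominate

stream = [3, 250, 47, 0, 999]     # interaction counts arriving on the stream
print([assign_task(n) for n in stream])   # [0, 2, 1, 0, 2]
</code></pre>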
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures, to be published in KDD &#39;25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09953">arXiv:2411.09953</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09953">pdf</a>, <a href="https://arxiv.org/format/2411.09953">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Brain-inspired Action Generation with Spiking Transformer Diffusion Policy Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qianhao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yinqian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+E">Enmeng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09953v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) has the ability to extract spatio-temporal features due to their spiking sequence. While previous research has primarily foucus on the classification of image and reinforcement learning. In our paper, we put forward novel diffusion policy model based on Spiking Transformer Neural Networks and Denoising Diffusion Probabilistic Model (DDPM): Spiking Transformer Modulat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09953v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09953v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09953v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) has the ability to extract spatio-temporal features due to their spiking sequence. While previous research has primarily foucus on the classification of image and reinforcement learning. In our paper, we put forward novel diffusion policy model based on Spiking Transformer Neural Networks and Denoising Diffusion Probabilistic Model (DDPM): Spiking Transformer Modulate Diffusion Policy Model (STMDP), a new brain-inspired model for generating robot action trajectories. In order to improve the performance of this model, we develop a novel decoder module: Spiking Modulate De coder (SMD), which replaces the traditional Decoder module within the Transformer architecture. Additionally, we explored the substitution of DDPM with Denoising Diffusion Implicit Models (DDIM) in our frame work. We conducted experiments across four robotic manipulation tasks and performed ablation studies on the modulate block. Our model consistently outperforms existing Transformer-based diffusion policy method. Especially in Can task, we achieved an improvement of 8%. The proposed STMDP method integrates SNNs, dffusion model and Transformer architecture, which offers new perspectives and promising directions for exploration in brain-inspired robotics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09953v1-abstract-full').style.display = 'none'; document.getElementById('2411.09953v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures and 2 tables, conference submission</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Q25 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09600">arXiv:2411.09600</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09600">pdf</a>, <a href="https://arxiv.org/format/2411.09600">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Latency Optimization in LEO Satellite Communications with Hybrid Beam Pattern and Interference Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qianqian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Ye Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+M">Minchae Jung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09600v1-abstract-short" style="display: inline;"> The rapid advancement of low Earth orbit (LEO) satellite communication systems has significantly enhanced global connectivity, offering high-capacity, low-latency services crucial for next-generation applications. However, the dense configuration of LEO constellations poses challenges in resource allocation optimization and interference management, complicating coexistence with other communication&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09600v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09600v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09600v1-abstract-full" style="display: none;"> The rapid advancement of low Earth orbit (LEO) satellite communication systems has significantly enhanced global connectivity, offering high-capacity, low-latency services crucial for next-generation applications. However, the dense configuration of LEO constellations poses challenges in resource allocation optimization and interference management, complicating coexistence with other communication systems. To address these limitations, this paper proposes a novel framework for optimizing the beam scheduling and resource allocation in multi-beam LEO systems. 
To satisfy the uneven terrestrial traffic demand, a hybrid beam pattern is employed to enhance the downlink quality of service and minimize the transmission latency from LEO satellites to ground user terminals. Additionally, a dynamic co-channel interference (CCI) control mechanism is developed to mitigate inter-beam interference within the LEO constellation and limit cross-system interference affecting protected users from other networks. The problem of user-beam-frequency allocation with power optimization is formulated as a mixed-integer dynamic programming model and solved using a low-complexity neural network-based graph generation algorithm. Simulation results show that the proposed approach outperforms the baseline methods of full frequency reuse and single-channel transmission, and they highlight the potential for further performance improvement with multi-user transmissions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09289">arXiv:2411.09289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09289">pdf</a>, <a href="https://arxiv.org/format/2411.09289">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StreamAdapter: Efficient Test Time Adaptation from Contextual Streams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Muhtar%2C+D">Dilxat Muhtar</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yelong Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yaming Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yadong Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jianfeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+Y">Yuefeng Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+W">Weiwei Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+F">Feng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xueliang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weizhu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2411.09289v1-abstract-full"> In-context learning (ICL) allows large language models (LLMs) to adapt to new tasks directly from the given demonstrations without requiring gradient updates. While recent advances have expanded context windows to accommodate more demonstrations, this approach increases inference costs without necessarily improving performance. To mitigate these issues, we propose StreamAdapter, a novel approach that directly updates model parameters from context at test time, eliminating the need for explicit in-context demonstrations. StreamAdapter employs context mapping and weight absorption mechanisms to dynamically transform ICL demonstrations into parameter updates with minimal additional parameters. By reducing reliance on numerous in-context examples, StreamAdapter significantly reduces inference costs and allows for efficient inference with constant time complexity, regardless of demonstration count. Extensive experiments across diverse tasks and model architectures demonstrate that StreamAdapter achieves comparable or superior adaptation capability to ICL while requiring significantly fewer demonstrations. The superior task adaptation and context encoding capabilities of StreamAdapter on both language understanding and generation tasks provide a new perspective for adapting LLMs at test time using context, allowing for more efficient adaptation across scenarios and more cost-effective inference. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
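<p class="is-size-7">To contrast with keeping demonstrations in context, here is a generic test-time-adaptation sketch that folds demonstrations into the weights via gradient steps (our illustration only; StreamAdapter&#39;s context-mapping and weight-absorption mechanisms work differently and require no backpropagation at test time):</p>
<pre><code class="language-python">import torch, torch.nn as nn, torch.nn.functional as F

model = nn.Linear(16, 4)                         # stand-in for an adaptable LLM head
demos = [(torch.randn(16), torch.tensor(1)) for _ in range(8)]

opt = torch.optim.SGD(model.parameters(), lr=1e-2)
for x, y in demos:                               # absorb demonstrations into the weights
    loss = F.cross_entropy(model(x).unsqueeze(0), y.unsqueeze(0))
    opt.zero_grad(); loss.backward(); opt.step()

query = torch.randn(16)                          # inference no longer carries the demos,
pred = model(query).argmax().item()              # so its cost is constant in demo count
</code></pre>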
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 Pages, 9 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08794">arXiv:2411.08794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08794">pdf</a>, <a href="https://arxiv.org/format/2411.08794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating World Models with LLM for Decision Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinrun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Junzhe Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qinggang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiao Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08794v1-abstract-short" style="display: inline;"> World model emerges as a key module in decision making, where MuZero and Dreamer achieve remarkable successes in complex tasks. Recent work leverages Large Language Models (LLMs) as general world simulators to simulate the dynamics of the world due to their generalizability. LLMs also serve as the world model for deliberative reasoning in Reasoning via Planning (RAP) and Tree of Thought (ToT). How&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08794v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08794v1-abstract-full" style="display: none;"> World model emerges as a key module in decision making, where MuZero and Dreamer achieve remarkable successes in complex tasks. Recent work leverages Large Language Models (LLMs) as general world simulators to simulate the dynamics of the world due to their generalizability. LLMs also serve as the world model for deliberative reasoning in Reasoning via Planning (RAP) and Tree of Thought (ToT). However, the world models are either evaluated as a general world simulator, or as a functional module of the agent, i.e., predicting the transitions to assist the planning. In this work, we propose a comprehensive evaluation of the world models with LLMs from the decision making perspective. Specifically, we leverage the 31 diverse environments from (Wang et al., 2023;2024) and curate the rule-based policy of each environment for the diverse evaluation. Then, we design three main tasks, i.e., policy verification, action proposal, and policy planning, where the world models can be used for decision making solely. Finally, we conduct the comprehensive evaluation of the advanced LLMs, i.e., GPT-4o and GPT-4o-mini, on the environments for the three main tasks under various settings. 
The key observations include: i) GPT-4o significantly outperforms GPT-4o-mini on the three main tasks, especially on tasks that require domain knowledge; ii) the performance of LLM-based world models degrades on long-term decision-making tasks; and iii) combining different functionalities of the world model introduces additional performance instability. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08768">arXiv:2411.08768</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08768">pdf</a>, <a href="https://arxiv.org/format/2411.08768">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Sharingan: Extract User Action Sequence from Desktop Recordings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+X">Xiaoting Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+K">Kehong Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+L">Lu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongmei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rajmohan%2C+S">Saravan Rajmohan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.08768v1-abstract-full"> Video recordings of user activities, particularly desktop recordings, offer a rich source of data for understanding user behaviors and automating processes.
However, despite advancements in Vision-Language Models (VLMs) and their increasing use in video analysis, extracting user actions from desktop recordings remains an underexplored area. This paper addresses this gap by proposing two novel VLM-based methods for user action extraction: the Direct Frame-Based Approach (DF), which inputs sampled frames directly into VLMs, and the Differential Frame-Based Approach (DiffF), which incorporates explicit frame differences detected via computer vision techniques. We evaluate these methods using a basic self-curated dataset and an advanced benchmark adapted from prior work. Our results show that the DF approach achieves an accuracy of 70% to 80% in identifying user actions, with the extracted action sequences being replayable through Robotic Process Automation. We find that while VLMs show potential, incorporating explicit UI changes can degrade performance, making the DF approach more reliable. This work represents the first application of VLMs for extracting user action sequences from desktop recordings, contributing new methods, benchmarks, and insights for future research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
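<p class="is-size-7">The explicit frame differencing that DiffF feeds to the VLM can be sketched with standard computer-vision primitives (parameters below are illustrative; OpenCV is assumed available):</p>
<pre><code class="language-python">import cv2
import numpy as np

def changed_regions(prev, curr, min_area=500):
    """prev, curr: grayscale frames (uint8). Returns bounding boxes of changed UI regions."""
    diff = cv2.absdiff(prev, curr)                       # pixel-wise frame difference
    _, mask = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return [cv2.boundingRect(c) for c in contours if cv2.contourArea(c) >= min_area]

frame_a = np.zeros((480, 640), np.uint8)
frame_b = frame_a.copy()
frame_b[100:200, 100:300] = 255                          # e.g. a dialog appears
print(changed_regions(frame_a, frame_b))                 # [(100, 100, 200, 100)]
</code></pre>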
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08466">arXiv:2411.08466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08466">pdf</a>, <a href="https://arxiv.org/format/2411.08466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Can MLLMs Guide Weakly-Supervised Temporal Action Localization Tasks? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Quan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Y">Yuxin Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.08466v1-abstract-full"> Recent breakthroughs in Multimodal Large Language Models (MLLMs) have gained significant recognition within the deep learning community, where the fusion of Video Foundation Models (VFMs) and Large Language Models (LLMs) has proven instrumental in constructing robust video understanding systems, effectively surmounting constraints associated with predefined visual tasks. These sophisticated MLLMs exhibit remarkable proficiency in comprehending videos, swiftly attaining unprecedented performance levels across diverse benchmarks. However, their operation demands substantial memory and computational resources, underscoring the continued importance of traditional models in video comprehension tasks. In this paper, we introduce a novel learning paradigm termed MLLM4WTAL. This paradigm harnesses the potential of MLLMs to offer temporal action key semantics and complete semantic priors for conventional Weakly-supervised Temporal Action Localization (WTAL) methods. MLLM4WTAL facilitates the enhancement of WTAL by leveraging MLLM guidance. It achieves this by integrating two distinct modules: Key Semantic Matching (KSM) and Complete Semantic Reconstruction (CSR). These modules work in tandem to address the incomplete and over-complete localization results common in WTAL methods. Rigorous experiments are conducted to validate the efficacy of our proposed approach in augmenting the performance of various heterogeneous WTAL models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
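<p class="is-size-7">The flavor of key-semantic matching can be conveyed with a toy similarity score between snippet features and MLLM-provided key phrases (random stand-in embeddings; KSM&#39;s actual formulation is in the paper):</p>
<pre><code class="language-python">import numpy as np

def match_scores(snippet_feats, key_feats):
    a = snippet_feats / np.linalg.norm(snippet_feats, axis=1, keepdims=True)
    b = key_feats / np.linalg.norm(key_feats, axis=1, keepdims=True)
    return (a @ b.T).max(axis=1)       # per-snippet prior from the best-matching key phrase

snippets = np.random.randn(100, 512)   # T snippet features from a video backbone
keys = np.random.randn(3, 512)         # embeddings of MLLM-provided action key semantics
prior = match_scores(snippets, keys)   # temporal prior to guide a WTAL head
</code></pre>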
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07751">arXiv:2411.07751</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07751">pdf</a>, <a href="https://arxiv.org/format/2411.07751">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jiaran Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yaodan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hexin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+L+P">Leibny Paola Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07751v1-abstract-full"> Speech enhancement plays an essential role in various applications, and the integration of visual information has been demonstrated to bring substantial advantages. However, the majority of current research concentrates on the examination of facial and lip movements, which can be compromised or entirely inaccessible in scenarios where occlusions occur or when the camera view is distant. Meanwhile, contextual visual cues from the surrounding environment have been overlooked: for example, when we see a dog bark, our brain has the innate ability to discern and filter out the barking noise. To this end, in this paper, we introduce a novel task, i.e., SAV-SE. To the best of our knowledge, this is the first proposal to use rich contextual information from synchronized video as auxiliary cues to indicate the type of noise, which eventually improves the speech enhancement performance.

arXiv:2411.07690 (https://arxiv.org/abs/2411.07690) [pdf] cs.AI
World Models: The Safety Perspective
Authors: Zifan Zeng, Chongzhe Zhang, Feng Liu, Joseph Sifakis, Qunli Zhang, Shiming Liu, Peng Wang
Abstract: With the proliferation of the Large Language Model (LLM), the concept of World Models (WM) has recently attracted a great deal of attention in the AI research community, especially in the context of AI agents. It is arguably evolving into an essential foundation for building AI agent systems. A WM is intended to help the agent predict the future evolution of environmental states or help the agent fill in missing information so that it can plan its actions and behave safely. The safety property of WMs plays a key role in their effective use in critical applications.
In this work, we review and analyze the impacts of the current state of the art in WM technology from the point of view of trustworthiness and safety, based on a comprehensive survey and the fields of application envisaged. We provide an in-depth analysis of state-of-the-art WMs and derive technical research challenges and their impact, in order to call on the research community to collaborate on improving the safety and trustworthiness of WMs.
Submitted 12 November, 2024; originally announced November 2024.
Comments: 8 pages, 3 figures, accepted at the International Workshop on Dependability Modeling and Design (WDMD) during the IEEE International Symposium on Software Reliability Engineering (ISSRE)

arXiv:2411.07618 (https://arxiv.org/abs/2411.07618) [pdf, other] cs.AI cs.CL
Direct Preference Optimization Using Sparse Feature-Level Constraints
Authors: Qingyu Yin, Chak Tou Leong, Hongbo Zhang, Minjun Zhu, Hanqi Yan, Qiang Zhang, Yulan He, Wenjie Li, Jun Wang, Yue Zhang, Linyi Yang
Abstract:
The alignment of large language models (LLMs) with human preferences remains a key challenge. While post-training techniques like Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have achieved notable success, they often introduce computational inefficiencies and training instability. In this paper, we propose Feature-level constrained Preference Optimization (FPO), a novel method designed to simplify the alignment process while ensuring stability. FPO leverages pre-trained Sparse Autoencoders (SAEs) and introduces feature-level constraints, allowing for efficient, sparsity-enforced alignment. Our approach enjoys efficiency by using sparse features activated in a well-trained sparse autoencoder, and gains the quality of sequential KL divergence by using the feature-level offline reference. Experimental results on benchmark datasets demonstrate that FPO achieves a 5.08% absolute improvement in win rate with much lower computational cost compared to state-of-the-art baselines, making it a promising solution for efficient and controllable LLM alignment.
Submitted 12 November, 2024; originally announced November 2024.
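A loose sketch of the general recipe the abstract describes: a DPO-style preference loss plus a constraint tying sparse SAE features of the policy to an offline reference. The function names, the MSE stand-in for the feature-level divergence, and all shapes are assumptions here, not FPO's exact objective:

```python
import numpy as np

def sae_features(h, W_enc, b_enc):
    """Sparse autoencoder encoder: ReLU yields sparse, interpretable features."""
    return np.maximum(h @ W_enc + b_enc, 0.0)

def fpo_like_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
                  feats_policy, feats_ref, beta=0.1, lam=0.01):
    """DPO-style preference term plus a feature-level constraint.

    logp_*: policy log-probs of the chosen (w) and rejected (l) responses.
    ref_logp_*: the same under a frozen reference model.
    feats_*: sparse SAE features of policy/reference hidden states.
    """
    # Standard DPO logistic loss on the implicit reward margin.
    margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
    pref = -np.log(1.0 / (1.0 + np.exp(-margin)))
    # Feature-level constraint: keep sparse features near the reference,
    # a cheap stand-in for full-distribution KL regularization.
    constraint = np.mean((feats_policy - feats_ref) ** 2)
    return pref + lam * constraint

rng = np.random.default_rng(0)
h_p, h_r = rng.normal(size=64), rng.normal(size=64)
W, b = rng.normal(size=(64, 512)) * 0.05, -0.1 * np.ones(512)
print(fpo_like_loss(-12.0, -15.0, -12.5, -14.0,
                    sae_features(h_p, W, b), sae_features(h_r, W, b)))
```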

arXiv:2411.07483 (https://arxiv.org/abs/2411.07483) [pdf, other] stat.ML cs.CV cs.IT cs.LG eess.IV
Quantifying Knowledge Distillation Using Partial Information Decomposition
Authors: Pasan Dissanayake, Faisal Hamman, Barproda Halder, Ilia Sucholutsky, Qiuyi Zhang, Sanghamitra Dutta
Abstract: Knowledge distillation provides an effective method for deploying complex machine learning models in resource-constrained environments. It typically involves training a smaller student model to emulate either the probabilistic outputs or the internal feature representations of a larger teacher model. By doing so, the student model often achieves substantially better performance on a downstream task compared to when it is trained independently. Nevertheless, the teacher's internal representations can also encode noise or additional information that may not be relevant to the downstream task. This observation motivates our primary question: What are the information-theoretic limits of knowledge transfer? To this end, we leverage a body of work in information theory called Partial Information Decomposition (PID) to quantify the distillable and distilled knowledge of a teacher's representation corresponding to a given student and a downstream task. Moreover, we demonstrate that this metric can be practically used in distillation to address challenges caused by the complexity gap between the teacher and the student representations.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Accepted at the NeurIPS 2024 Machine Learning and Compression Workshop
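The two emulation targets the abstract mentions, probabilistic outputs and internal features, correspond to the classic distillation losses sketched below. The PID analysis itself is in the paper; this toy loss only frames what is being quantified, and all names and shapes are illustrative:

```python
import numpy as np

def softmax(z, T=1.0):
    e = np.exp((z - z.max(axis=-1, keepdims=True)) / T)
    return e / e.sum(axis=-1, keepdims=True)

def distillation_loss(student_logits, teacher_logits,
                      student_feat, teacher_feat, T=4.0, alpha=0.5):
    """Toy KD loss combining the two emulation targets: match the teacher's
    softened output distribution (KL) and its internal feature representation
    (MSE, with an identity projection assumed for simplicity)."""
    p_t = softmax(teacher_logits, T)
    p_s = softmax(student_logits, T)
    kl = np.sum(p_t * (np.log(p_t + 1e-9) - np.log(p_s + 1e-9)), axis=-1).mean()
    feat_mse = np.mean((student_feat - teacher_feat) ** 2)
    return alpha * (T * T) * kl + (1 - alpha) * feat_mse

rng = np.random.default_rng(0)
print(distillation_loss(rng.normal(size=(8, 10)), rng.normal(size=(8, 10)),
                        rng.normal(size=(8, 32)), rng.normal(size=(8, 32))))
```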

arXiv:2411.07478 (https://arxiv.org/abs/2411.07478) [pdf, other] cs.CV
GUS-IR: Gaussian Splatting with Unified Shading for Inverse Rendering
Authors: Zhihao Liang, Hongdong Li, Kui Jia, Kailing Guo, Qi Zhang
Abstract: Recovering the intrinsic physical attributes of a scene from images, generally termed the inverse rendering problem, has been a central and challenging task in computer vision and computer graphics. In this paper, we present GUS-IR, a novel framework designed to address the inverse rendering problem for complicated scenes featuring rough and glossy surfaces. This paper starts by analyzing and comparing two prominent shading techniques popularly used for inverse rendering, forward shading and deferred shading, and their effectiveness in handling complex materials. More importantly, we propose a unified shading solution that combines the advantages of both techniques for better decomposition. In addition, we analyze the normal modeling in 3D Gaussian Splatting (3DGS) and utilize the shortest axis as the normal for each particle in GUS-IR, along with a depth-related regularization, resulting in improved geometric representation and better shape reconstruction. Furthermore, we enhance the probe-based baking scheme proposed by GS-IR to achieve more accurate ambient occlusion modeling and to better handle indirect illumination. Extensive experiments demonstrate the superior performance of GUS-IR in achieving precise intrinsic decomposition and geometric representation, supporting many downstream tasks (such as relighting and retouching) in computer vision, graphics, and extended reality.
Submitted 11 November, 2024; originally announced November 2024.
Comments: 15 pages, 11 figures
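The shortest-axis normal heuristic is simple to illustrate: for a flattened 3D Gaussian, take the covariance eigenvector with the smallest eigenvalue. A minimal sketch, not the paper's code:

```python
import numpy as np

def shortest_axis_normal(cov):
    """Normal of a flattened 3D Gaussian: the eigenvector of the covariance
    with the smallest eigenvalue, i.e. the particle's shortest axis."""
    eigvals, eigvecs = np.linalg.eigh(cov)   # eigenvalues in ascending order
    n = eigvecs[:, 0]
    return n / np.linalg.norm(n)

# A Gaussian squashed along z: its normal should be ~ +/- z.
R = np.eye(3)
scales = np.array([1.0, 0.8, 0.05])
cov = R @ np.diag(scales**2) @ R.T
print(shortest_axis_normal(cov))  # ~ [0, 0, 1] up to sign
```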

arXiv:2411.07135 (https://arxiv.org/abs/2411.07135) [pdf, other] cs.CV cs.AI cs.GR
Edify 3D: Scalable High-Quality 3D Asset Generation
Authors: NVIDIA: Maciej Bala, Yin Cui, Yifan Ding, Yunhao Ge, Zekun Hao, Jon Hasselgren, Jacob Huffman, Jingyi Jin,
J. P. Lewis, Zhaoshuo Li, Chen-Hsuan Lin, Yen-Chen Lin, Tsung-Yi Lin, Ming-Yu Liu, Alice Luo, Qianli Ma, Jacob Munkberg, Stella Shi, Fangyin Wei, Donglai Xiang, Jiashu Xu, Xiaohui Zeng, Qinsheng Zhang
Abstract: We introduce Edify 3D, an advanced solution designed for high-quality 3D asset generation. Our method first synthesizes RGB and surface normal images of the described object at multiple viewpoints using a diffusion model. The multi-view observations are then used to reconstruct the shape, texture, and PBR materials of the object. Our method can generate high-quality 3D assets with detailed geometry, clean shape topologies, high-resolution textures, and materials within 2 minutes of runtime.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Project website: https://research.nvidia.com/labs/dir/edify-3d

arXiv:2411.07126 (https://arxiv.org/abs/2411.07126) [pdf, other] cs.CV cs.LG
Edify Image: High-Quality Image Generation with Pixel Space Laplacian Diffusion Models
Authors: NVIDIA: Yuval Atzmon, Maciej Bala, Yogesh Balaji, Tiffany Cai, Yin Cui, Jiaojiao Fan, Yunhao Ge, Siddharth Gururani, Jacob Huffman, Ronald Isaac, Pooya Jannaty, Tero Karras, Grace Lam, J. P. Lewis, Aaron Licata, Yen-Chen Lin, Ming-Yu Liu, Qianli Ma, Arun Mallya, Ashlee Martino-Tarr, Doug Mendez, Seungjun Nah, Chris Pruett, et al. (7 additional authors not shown)
Abstract:
We introduce Edify Image, a family of diffusion models capable of generating photorealistic image content with pixel-perfect accuracy. Edify Image utilizes cascaded pixel-space diffusion models trained using a novel Laplacian diffusion process, in which image signals at different frequency bands are attenuated at varying rates. Edify Image supports a wide range of applications, including text-to-image synthesis, 4K upsampling, ControlNets, 360 HDR panorama generation, and finetuning for image customization.
Submitted 11 November, 2024; originally announced November 2024.
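As a rough analogy to the Laplacian diffusion process, the sketch below splits an image into frequency bands with an ordinary Laplacian pyramid and attenuates each band at its own rate. The nearest-neighbor resampling and the rate values are assumptions for illustration, not NVIDIA's actual process:

```python
import numpy as np

def downsample(x):
    """Naive 2x2 average pooling."""
    return x.reshape(x.shape[0] // 2, 2, x.shape[1] // 2, 2).mean(axis=(1, 3))

def upsample(x):
    """Naive nearest-neighbor 2x upsampling."""
    return np.repeat(np.repeat(x, 2, axis=0), 2, axis=1)

def laplacian_bands(img, levels=3):
    """Split an image into band-pass details plus a low-frequency residual."""
    bands, cur = [], img
    for _ in range(levels):
        low = downsample(cur)
        bands.append(cur - upsample(low))   # band-pass detail at this scale
        cur = low
    bands.append(cur)                       # low-frequency residual
    return bands

def attenuated_reconstruct(bands, rates):
    """Recombine the bands, scaling each by its own attenuation rate
    (highest-frequency band listed first, decaying fastest here)."""
    recon = rates[-1] * bands[-1]
    for band, r in zip(reversed(bands[:-1]), reversed(rates[:-1])):
        recon = upsample(recon) + r * band
    return recon

img = np.random.default_rng(0).normal(size=(64, 64))
out = attenuated_reconstruct(laplacian_bands(img), rates=[0.2, 0.5, 0.8, 1.0])
print(out.shape)  # (64, 64)
```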

arXiv:2411.07057 (https://arxiv.org/abs/2411.07057) [pdf, other] cs.NE math.NA math.OC
Randomized Forward Mode Gradient for Spiking Neural Networks in Scientific Machine Learning
Authors: Ruyin Wan, Qian Zhang, George Em Karniadakis
Abstract: Spiking neural networks (SNNs) represent a promising approach in machine learning, combining the hierarchical learning capabilities of deep neural networks with the energy efficiency of spike-based computations. Traditional end-to-end training of SNNs is often based on back-propagation, where weight updates are derived from gradients computed through the chain rule. However, this method encounters challenges due to its limited biological plausibility and inefficiencies on neuromorphic hardware. In this study, we introduce an alternative training approach for SNNs. Instead of using back-propagation, we leverage weight perturbation methods within a forward-mode gradient framework. Specifically, we perturb the weight matrix with a small noise term and estimate gradients by observing the changes in the network output. Experimental results on regression tasks, including solving various PDEs, show that our approach achieves competitive accuracy, suggesting its suitability for neuromorphic systems and potential hardware compatibility.
Submitted 11 November, 2024; originally announced November 2024.
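The weight-perturbation estimator the abstract describes can be written down directly: perturb along random directions and weight each direction by the observed change in the loss. A minimal NumPy sketch of the generic estimator, not the paper's SNN training loop:

```python
import numpy as np

def forward_mode_grad(loss_fn, w, eps=1e-4, n_samples=8, rng=None):
    """Randomized forward-mode gradient estimate via weight perturbation.

    Perturb w along random directions v and use the finite-difference
    directional derivative (L(w + eps*v) - L(w)) / eps as the coefficient
    of v. For Gaussian v, averaging (grad . v) v over samples recovers the
    gradient in expectation, with no back-propagation required.
    """
    rng = rng or np.random.default_rng(0)
    base = loss_fn(w)
    g = np.zeros_like(w)
    for _ in range(n_samples):
        v = rng.normal(size=w.shape)
        g += (loss_fn(w + eps * v) - base) / eps * v
    return g / n_samples

# Quadratic sanity check: the gradient of 0.5*||w||^2 is w itself.
w = np.array([1.0, -2.0, 3.0])
print(forward_mode_grad(lambda x: 0.5 * np.sum(x**2), w, n_samples=2000))
```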

arXiv:2411.05655 (https://arxiv.org/abs/2411.05655) [pdf, other] cs.NI
Joint Age and Coverage-Optimal Satellite Constellation Relaying in Cislunar Communications with Hybrid Orbits
Authors: Afang Yuan, Zhouyong Hu, Zhili Sun, Qinyu Zhang, Zhihua Yang
Abstract: With ever-increasing lunar missions, there is growing interest in designing data relay satellite constellations for cislunar communications, a task challenged by the constrained visibility and the huge distance between the Earth and the Moon when establishing real-time communication links. In this work, we therefore propose an age- and coverage-optimal relay satellite constellation for cislunar communication that accounts for the self-rotation of the Earth as well as the orbital motion of the Moon, consisting of hybrid Earth-Moon Libration 1/2 (EML1/L2) point Halo orbits, ordinary lunar orbits, and Geostationary Earth Orbit (GEO) satellites. In particular, by minimizing both the number of satellites and the average per-device Age of Information (AoI) while maximizing the coverage ratio of specific lunar surface regions, a multi-objective optimization problem is formulated and solved using a well-designed Nondominated Sorting Genetic Algorithm-II (NSGA-II). The simulation results demonstrate that our proposed hybrid constellation significantly outperforms traditional Walker Star and Delta constellations in terms of both AoI and communication coverage.
Submitted 8 November, 2024; originally announced November 2024.
Comments: 13 pages, 10 figures
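At the heart of NSGA-II is nondominated sorting. A tiny sketch of the dominance test and the first Pareto front, with hypothetical objective values (satellite count, mean AoI, negative coverage, all to be minimized):

```python
import numpy as np

def dominates(a, b):
    """a dominates b if it is no worse in every objective and strictly
    better in at least one (all objectives minimized)."""
    return np.all(a <= b) and np.any(a < b)

def pareto_front(F):
    """Indices of nondominated points: the first front NSGA-II sorts out."""
    return [i for i in range(len(F))
            if not any(dominates(F[j], F[i]) for j in range(len(F)) if j != i)]

# Toy constellation designs scored on (num_satellites, mean_AoI, -coverage).
F = np.array([[12, 30.0, -0.90],
              [10, 45.0, -0.85],
              [12, 25.0, -0.95],
              [15, 25.0, -0.95]])
print(pareto_front(F))  # -> [1, 2]; designs 0 and 3 are dominated by design 2
```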

arXiv:2411.04568 (https://arxiv.org/abs/2411.04568) [pdf, other] cs.HC eess.SP q-bio.NC
Dynamic-Attention-based EEG State Transition Modeling for Emotion Recognition
Authors: Xinke Shen, Runmin Gan, Kaixuan Wang, Shuyi Yang, Qingzhu Zhang, Quanying Liu, Dan Zhang, Sen Song
Abstract: Electroencephalogram (EEG)-based emotion decoding can objectively quantify people's emotional state and has broad application prospects in human-computer interaction and early detection of emotional disorders. Recently emerging deep learning architectures have significantly improved the performance of EEG emotion decoding. However, existing methods still fall short of fully capturing the complex spatiotemporal dynamics of neural signals, which are crucial for representing emotion processing. This study proposes a Dynamic-Attention-based EEG State Transition (DAEST) modeling method to characterize EEG spatiotemporal dynamics. The model extracts spatiotemporal components of EEG that represent multiple parallel neural processes and estimates dynamic attention weights on these components to capture transitions in brain states. The model is optimized within a contrastive learning framework for cross-subject emotion recognition. The proposed method achieved state-of-the-art performance on three publicly available datasets: FACED, SEED, and SEED-V. It achieved 75.4% accuracy in the binary classification of positive and negative emotions and 59.3% in nine-class discrete emotion classification on the FACED dataset, 88.1% in the three-class classification of positive, negative, and neutral emotions on the SEED dataset, and 73.6% in five-class discrete emotion classification on the SEED-V dataset. The learned EEG spatiotemporal patterns and dynamic transition properties offer valuable insights into neural dynamics underlying emotion processing.
Submitted 7 November, 2024; originally announced November 2024.
Comments: 14 pages, 6 figures
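The dynamic-attention idea (per-time-step weights over parallel spatiotemporal components, whose evolution marks brain-state transitions) can be caricatured as follows; the scoring vector and shapes are assumptions, not the DAEST architecture:

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def dynamic_component_attention(components, W_q):
    """Toy attention over parallel neural components at each time step.

    components: (T, C, D) activity of C spatiotemporal components over time.
    W_q: (D,) toy scoring vector standing in for a learned attention head.
    Returns attention weights (T, C), whose changes over time can be read
    as brain-state transitions, plus the re-weighted signal (T, D).
    """
    scores = components @ W_q                 # (T, C) per-component scores
    att = softmax(scores)                     # per-time-step weights
    mixed = np.einsum('tc,tcd->td', att, components)
    return att, mixed

rng = np.random.default_rng(0)
att, mixed = dynamic_component_attention(rng.normal(size=(200, 6, 32)),
                                         rng.normal(size=32))
print(att.shape, mixed.shape)  # (200, 6) (200, 32)
```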

arXiv:2411.04335 (https://arxiv.org/abs/2411.04335) [pdf, other] cs.CV
GazeGen: Gaze-Driven User Interaction for Visual Content Generation
Authors: He-Yen Hsieh, Ziyun Li, Sai Qian Zhang, Wei-Te Mark Ting, Kao-Den Chang, Barbara De Salvo, Chiao Liu, H. T. Kung
Abstract: We present GazeGen, a user interaction system that generates visual content (images and videos) for locations indicated by the user's eye gaze. GazeGen allows intuitive manipulation of visual content by targeting regions of interest with gaze.
Using advanced techniques in object detection and generative AI, GazeGen performs gaze-controlled image adding/deleting, repositioning, and surface style changes of image objects, and converts static images into videos. Central to GazeGen is the DFT Gaze (Distilled and Fine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters, performing accurate real-time gaze predictions tailored to individual users' eyes on small edge devices. GazeGen is the first system to combine visual content generation with real-time gaze estimation, made possible exclusively by DFT Gaze. This real-time gaze estimation enables various visual content generation tasks, all controlled by the user's gaze. The input for DFT Gaze is the user's eye images, while the inputs for visual content generation are the user's view and the predicted gaze point from DFT Gaze. To achieve efficient gaze predictions, we derive the small model from a large model (10x larger) via novel knowledge distillation and personal adaptation techniques. We integrate knowledge distillation with a masked autoencoder, developing a compact yet powerful gaze estimation model. This model is further fine-tuned with Adapters, enabling highly accurate and personalized gaze predictions with minimal user input. DFT Gaze ensures low-latency and precise gaze tracking, supporting a wide range of gaze-driven tasks. We validate the performance of DFT Gaze on the AEA and OpenEDS2020 benchmarks, demonstrating low angular gaze error and low latency on an edge device (Raspberry Pi 4). Furthermore, we describe applications of GazeGen, illustrating its versatility and effectiveness in various usage scenarios.
Submitted 17 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
Comments: 12 pages, 10 figures

arXiv:2411.03349 (https://arxiv.org/abs/2411.03349) [pdf, other] cs.AI cs.CL cs.LG
RuAG: Learned-rule-augmented Generation for Large Language Models
Authors: Yudi Zhang, Pei Xiao, Lu Wang, Chaoyun Zhang, Meng Fang, Yali Du, Yevgeniy Puzyrev, Randolph Yao, Si Qin, Qingwei Lin, Mykola Pechenizkiy, Dongmei Zhang, Saravan Rajmohan, Qi Zhang
Abstract: In-context learning (ICL) and Retrieval-Augmented Generation (RAG) have gained attention for their ability to enhance LLMs' reasoning by incorporating external knowledge, but they suffer from the limited contextual window size, leading to insufficient information injection. To this end, we propose a novel framework, RuAG, to automatically distill large volumes of offline data into interpretable first-order logic rules, which are injected into LLMs to boost their reasoning capabilities. Our method begins by formulating the search process relying on LLMs' commonsense, where LLMs automatically define head and body predicates. Then, RuAG applies Monte Carlo Tree Search (MCTS) to address the combinatorial search space and efficiently discover logic rules from data. The resulting logic rules are translated into natural language, allowing targeted knowledge injection and seamless integration into LLM prompts for downstream task reasoning. We evaluate our framework on public and private industrial tasks, including natural language processing, time-series, decision-making, and industrial tasks, demonstrating its effectiveness in enhancing LLMs' capability over diverse tasks.
Submitted 3 November, 2024; originally announced November 2024.
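The rule-search step can be pictured with a much simpler stand-in: score candidate rule bodies by their precision on offline data and keep the best. RuAG uses MCTS precisely because exhaustive or greedy search breaks down on large predicate spaces; the predicates and data below are invented for illustration:

```python
import itertools
import random

def rule_score(data, body, head, min_cov=5):
    """Precision of the rule (body -> head) on offline data, used as the
    search reward. `data` is a list of dicts of boolean predicate values."""
    covered = [row for row in data if all(row[p] for p in body)]
    if len(covered) < min_cov:
        return 0.0
    return sum(row[head] for row in covered) / len(covered)

def greedy_rule_search(data, predicates, head, max_body=2):
    """Tiny stand-in for RuAG's MCTS: enumerate short rule bodies and keep
    the most precise one. Real MCTS matters when the predicate space is big."""
    best = max((b for k in range(1, max_body + 1)
                for b in itertools.combinations(predicates, k)),
               key=lambda b: rule_score(data, b, head))
    return best, rule_score(data, best, head)

random.seed(0)
data = [{'raining': r, 'outdoor': o, 'wet': (r and o) or random.random() < 0.05}
        for r in (True, False) for o in (True, False) for _ in range(20)]
print(greedy_rule_search(data, ['raining', 'outdoor'], 'wet'))
# -> (('raining', 'outdoor'), 1.0), readable as "raining AND outdoor => wet"
```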

arXiv:2411.02199 (https://arxiv.org/abs/2411.02199) [pdf, other] cs.LG stat.ML
Provably Transformers Harness Multi-Concept Word Semantics for Efficient In-Context Learning
Authors: Dake Bu, Wei Huang, Andi Han, Atsushi Nitanda, Taiji Suzuki, Qingfu Zhang, Hau-San Wong
Abstract: Transformer-based large language models (LLMs) have displayed remarkable creative prowess and emergence capabilities. Existing empirical studies have revealed a strong connection between these LLMs' impressive emergence abilities and their in-context learning (ICL) capacity, allowing them to solve new tasks using only task-specific prompts without further fine-tuning.
On the other hand, existing empirical and theoretical studies also show that there is a linear regularity of the multi-concept encoded semantic representation behind transformer-based LLMs. However, existing theoretical work fails to build up an understanding of the connection between this regularity and the innovative power of ICL. Additionally, prior work often focuses on simplified, unrealistic scenarios involving linear transformers or unrealistic loss functions, and achieves only linear or sub-linear convergence rates. In contrast, this work provides a fine-grained mathematical analysis to show how transformers leverage the multi-concept semantics of words to enable powerful ICL and excellent out-of-distribution ICL abilities, offering insights into how transformers innovate solutions for certain unseen tasks encoded with multiple cross-concept semantics. Inspired by empirical studies on the linear latent geometry of LLMs, the analysis is based on a concept-based low-noise sparse coding prompt model. Leveraging advanced techniques, this work showcases exponential 0-1 loss convergence over the highly non-convex training dynamics, which pioneeringly incorporates the challenges of softmax self-attention, ReLU-activated MLPs, and cross-entropy loss. Empirical simulations corroborate the theoretical findings.
Submitted 12 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
Comments: Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)

arXiv:2411.02059 (https://arxiv.org/abs/2411.02059) [pdf, other] cs.LG cs.AI cs.DB
TableGPT2: A Large Multimodal Model with Tabular Data Integration
Authors: Aofeng Su, Aowen Wang, Chao Ye, Chen Zhou, Ga Zhang, Gang Chen, Guangcheng Zhu, Haobo Wang, Haokai Xu, Hao Chen, Haoze Li, Haoxuan Lan, Jiaming Tian, Jing Yuan, Junbo Zhao, Junlin Zhou, Kaizhe Shou, Liangyu Zha, Lin Long, Liyao Li, Pengzuo Wu, Qi Zhang, Qingyi Huang, Saisai Yang, Tao Zhang, et al. (8 additional authors not shown)
Abstract: The emergence of models like GPTs, Claude, LLaMA, and Qwen has reshaped AI applications, presenting vast new opportunities across industries. Yet, the integration of tabular data remains notably underdeveloped, despite its foundational role in numerous real-world domains. This gap is critical for three main reasons.
First, database or data warehouse data integration is essential for advanced applications; second, the vast and largely untapped resource of tabular data offers immense potential for analysis; and third, the business intelligence domain specifically demands adaptable, precise solutions that many current LLMs may struggle to provide. In response, we introduce TableGPT2, a model rigorously pre-trained and fine-tuned with over 593.8K tables and 2.36M high-quality query-table-output tuples, a scale of table-related data unprecedented in prior research. This extensive training enables TableGPT2 to excel in table-centric tasks while maintaining strong general language and coding abilities. One of TableGPT2's key innovations is its novel table encoder, specifically designed to capture schema-level and cell-level information. This encoder strengthens the model's ability to handle ambiguous queries, missing column names, and irregular tables commonly encountered in real-world applications. Similar to visual language models, this pioneering approach integrates with the decoder to form a robust large multimodal model. We believe the results are compelling: over 23 benchmarking metrics, TableGPT2 achieves an average performance improvement of 35.20% in the 7B model and 49.32% in the 72B model over prior benchmark-neutral LLMs, with robust general-purpose capabilities intact.
Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
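A toy picture of what "schema-level and cell-level" encoding might mean: embed column names and cell values separately and combine them per column. All of this (hash embeddings, mean pooling) is invented for illustration and is not TableGPT2's encoder:

```python
import numpy as np

def hash_embed(token, dim=64):
    """Toy embedding seeded by the token's hash (stable within one process)."""
    rng = np.random.default_rng(abs(hash(token)) % (2**32))
    return rng.normal(size=dim) / np.sqrt(dim)

def encode_table(columns, rows, dim=64):
    """Combine a schema-level (column name) embedding with pooled cell-level
    embeddings per column, then pool into one table summary vector."""
    col_vecs = []
    for j, name in enumerate(columns):
        schema = hash_embed(name, dim)                               # schema level
        cells = np.mean([hash_embed(str(r[j]), dim) for r in rows], axis=0)
        col_vecs.append(schema + cells)                              # cell level
    return np.mean(col_vecs, axis=0)

cols = ['city', 'population']
rows = [('Paris', 2_100_000), ('Lyon', 520_000)]
print(encode_table(cols, rows).shape)  # (64,)
```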
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02028">arXiv:2411.02028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02028">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> An Immediate Update Strategy of Multi-State Constraint Kalman Filter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingchao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+W">Wei Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jiale Han</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Q">Qi Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Maoran Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yuanxin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The lightweight Multi-state Constraint Kalman Filter (MSCKF) is well known for its high efficiency, and the delayed update has usually been adopted since its proposal. This work investigates an immediate update strategy for MSCKF based on timely reconstructed 3D feature points and measurement constraints. The differences between the delayed update and the immediate update are theoretically analyzed in detail. It is found that the immediate update helps construct more observation constraints and employs more filtering updates than the delayed update, which improves the linearization point of the measurement model and therefore enhances the estimation accuracy. Numerical simulations and experiments show that the immediate update strategy significantly enhances MSCKF even with a small number of feature observations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li>
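<p>A toy scalar Kalman filter makes the delayed-versus-immediate distinction concrete. In this linear setting the two coincide exactly; the gain reported in the paper comes from the nonlinear case, where updating immediately refreshes the linearization point. The code is a minimal sketch, not the paper&#39;s MSCKF.</p>
<pre><code class="language-python">
# Toy illustration (not the paper's MSCKF): contrast a "delayed" update,
# which batches measurements until a feature track ends, with an
# "immediate" update applied as each measurement becomes usable.
import numpy as np

def kf_update(x, P, z, R):
    # Standard scalar Kalman measurement update with H = 1.
    K = P / (P + R)
    return x + K * (z - x), (1.0 - K) * P

rng = np.random.default_rng(0)
truth, R = 5.0, 0.5
zs = truth + rng.normal(0.0, np.sqrt(R), size=6)

# Immediate: update after every measurement. In a nonlinear filter this
# refreshes the linearization point at each step.
x, P = 0.0, 10.0
for z in zs:
    x, P = kf_update(x, P, z, R)
print(f"immediate: x={x:.3f}, P={P:.4f}")

# Delayed: hold measurements until the track ends, then update once
# (the stacked-equivalent update for this linear toy problem).
x, P = 0.0, 10.0
x, P = kf_update(x, P, zs.mean(), R / len(zs))
print(f"delayed:   x={x:.3f}, P={P:.4f}")  # identical here; differs when nonlinear
</code></pre>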
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01623">arXiv:2411.01623</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01623">pdf</a>, <a href="https://arxiv.org/format/2411.01623">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FilterNet: Harnessing Frequency Filters for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yi%2C+K">Kun Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+J">Jingru Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+H">Hui He</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shufeng Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+D">Defu Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+W">Wei Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01623v2-abstract-short" style="display: inline;"> While numerous forecasters have been proposed using different network architectures, the Transformer-based models have state-of-the-art performance in time series forecasting. However, forecasters based on Transformers are still suffering from vulnerability to high-frequency signals, efficiency in computation, and bottleneck in full-spectrum utilization, which essentially are the cornerstones for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01623v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01623v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01623v2-abstract-full" style="display: none;"> While numerous forecasters have been proposed using different network architectures, the Transformer-based models have state-of-the-art performance in time series forecasting. However, forecasters based on Transformers are still suffering from vulnerability to high-frequency signals, efficiency in computation, and bottleneck in full-spectrum utilization, which essentially are the cornerstones for accurately predicting time series with thousands of points. In this paper, we explore a novel perspective of enlightening signal processing for deep time series forecasting. Inspired by the filtering process, we introduce one simple yet effective network, namely FilterNet, built upon our proposed learnable frequency filters to extract key informative temporal patterns by selectively passing or attenuating certain components of time series signals. 
Concretely, we propose two kinds of learnable filters in the FilterNet: (i) Plain shaping filter, that adopts a universal frequency kernel for signal filtering and temporal modeling; (ii) Contextual shaping filter, that utilizes filtered frequencies examined in terms of its compatibility with input signals for dependency learning. Equipped with the two filters, FilterNet can approximately surrogate the linear and attention mappings widely adopted in time series literature, while enjoying superb abilities in handling high-frequency noises and utilizing the whole frequency spectrum that is beneficial for forecasting. Finally, we conduct extensive experiments on eight time series forecasting benchmarks, and experimental results have demonstrated our superior performance in terms of both effectiveness and efficiency compared with state-of-the-art methods. Code is available at this repository: https://github.com/aikunyi/FilterNet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01623v2-abstract-full').style.display = 'none'; document.getElementById('2411.01623v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01426">arXiv:2411.01426</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01426">pdf</a>, <a href="https://arxiv.org/format/2411.01426">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> AURA: Amplifying Understanding, Resilience, and Awareness for Responsible AI Content Work </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A+Q">Alice Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Amores%2C+J">Judith Amores</a>, <a href="/search/cs?searchtype=author&amp;query=Gray%2C+M+L">Mary L. Gray</a>, <a href="/search/cs?searchtype=author&amp;query=Czerwinski%2C+M">Mary Czerwinski</a>, <a href="/search/cs?searchtype=author&amp;query=Suh%2C+J">Jina Suh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01426v1-abstract-short" style="display: inline;"> Behind the scenes of maintaining the safety of technology products from harmful and illegal digital content lies unrecognized human labor. The recent rise in the use of generative AI technologies and the accelerating demands to meet responsible AI (RAI) aims necessitates an increased focus on the labor behind such efforts in the age of AI. 
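<p>The "selectively passing or attenuating" operation can be pictured as multiplying the signal&#39;s spectrum by learnable weights. Below is a minimal sketch of a plain-shaping-style filter in PyTorch; it is an illustration only, and the authors&#39; repository above holds the real implementation.</p>
<pre><code class="language-python">
# Minimal sketch of a learnable frequency filter in the spirit of a
# "plain shaping filter" (illustrative only).
import torch
import torch.nn as nn

class PlainShapingFilter(nn.Module):
    def __init__(self, seq_len):
        super().__init__()
        n_freq = seq_len // 2 + 1                 # rFFT frequency bins
        self.weight = nn.Parameter(torch.ones(n_freq, dtype=torch.cfloat))

    def forward(self, x):                          # x: (batch, seq_len)
        spec = torch.fft.rfft(x, dim=-1)           # to frequency domain
        spec = spec * self.weight                  # pass/attenuate components
        return torch.fft.irfft(spec, n=x.size(-1), dim=-1)

f = PlainShapingFilter(seq_len=96)
y = f(torch.randn(4, 96))
print(y.shape)  # torch.Size([4, 96])
</code></pre>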
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01426">arXiv:2411.01426</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01426">pdf</a>, <a href="https://arxiv.org/format/2411.01426">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> AURA: Amplifying Understanding, Resilience, and Awareness for Responsible AI Content Work </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A+Q">Alice Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Amores%2C+J">Judith Amores</a>, <a href="/search/cs?searchtype=author&amp;query=Gray%2C+M+L">Mary L. Gray</a>, <a href="/search/cs?searchtype=author&amp;query=Czerwinski%2C+M">Mary Czerwinski</a>, <a href="/search/cs?searchtype=author&amp;query=Suh%2C+J">Jina Suh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Behind the scenes of maintaining the safety of technology products from harmful and illegal digital content lies unrecognized human labor. The recent rise in the use of generative AI technologies and the accelerating demands to meet responsible AI (RAI) aims necessitate an increased focus on the labor behind such efforts in the age of AI. This study investigates the nature and challenges of content work that supports RAI efforts, or &#34;RAI content work,&#34; spanning content moderation, data labeling, and red teaming, through the lived experiences of content workers. We conduct a formative survey and semi-structured interview studies to develop a conceptualization of RAI content work and a subsequent framework of recommendations for providing holistic support for content workers. We validate our recommendations through a series of workshops with content workers and derive considerations for and examples of implementing such recommendations. We discuss how our framework may guide future innovation to support the well-being and professional development of the RAI content workforce. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be presented at CSCW 2025</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00750">arXiv:2411.00750</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00750">pdf</a>, <a href="https://arxiv.org/format/2411.00750">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Tail Narrowing in LLM Self-Improvement via Socratic-Guided Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yiwen Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhuoyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+Y">Yitao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiaowei Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xunliang Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Self-improvement methods enable large language models (LLMs) to generate solutions themselves and iteratively train on filtered, high-quality rationales. This process proves effective and reduces the reliance on human supervision in LLMs&#39; reasoning, but the performance soon plateaus. We delve into the process and find that models tend to over-sample on easy queries and under-sample on queries they have yet to master. As iterations proceed, this imbalance in sampling is exacerbated, leading to a long-tail distribution where solutions to difficult queries almost vanish. This phenomenon limits the performance gain of self-improving models. A straightforward solution is brute-force sampling to balance the distribution, but this significantly raises computational costs. In this paper, we introduce Guided Self-Improvement (GSI), a strategy aimed at improving the efficiency of sampling challenging heavy-tailed data. It leverages Socratic-style guidance signals to help LLMs reason about complex queries, reducing the exploration effort and minimizing computational overhead. Experiments on four models across diverse mathematical tasks show that GSI strikes a balance between performance and efficiency, while also being effective on held-out tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Codes are publicly available at https://github.com/Yiwen-Ding/Guided-Self-Improvement</span> </p> </li>
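<p>The sampling strategy can be summarized as: spend guided re-sampling only where the model fails. A schematic sketch follows, with hypothetical helpers (sample, is_correct, socratic_hints) standing in for components the paper describes.</p>
<pre><code class="language-python">
# Schematic reading of the abstract (hypothetical helper names, not the
# released code): re-sample with Socratic-style guidance only on tail
# queries the model failed, instead of brute-force sampling everywhere.
def self_improve_round(model, queries, sample, is_correct, socratic_hints):
    new_data = []
    for q in queries:
        answers = [sample(model, q) for _ in range(4)]      # cheap first pass
        good = [a for a in answers if is_correct(q, a)]
        if not good:
            # Tail query: retry with guidance signals (sub-questions,
            # hints) to cut exploration effort on hard problems.
            hinted = socratic_hints(q)
            answers = [sample(model, hinted) for _ in range(4)]
            good = [a for a in answers if is_correct(q, a)]
        new_data.extend((q, a) for a in good)
    return new_data  # filtered rationales for the next fine-tuning step
</code></pre>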
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00722">arXiv:2411.00722</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00722">pdf</a>, <a href="https://arxiv.org/format/2411.00722">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Token-level Proximal Policy Optimization for Query Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+Y">Yichen Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fangkai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P">Pu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chenghua Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jianfeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+B">Bochen Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yaming Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+Y">Yuefeng Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Rajmohan%2C+S">Saravan Rajmohan</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+W">Weiwei Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongmei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+F">Feng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Query generation is a critical task for web search engines (e.g., Google, Bing) and recommendation systems. Recently, state-of-the-art query generation methods have leveraged Large Language Models (LLMs) for their strong capabilities in context understanding and text generation. However, they still face challenges in generating high-quality queries in terms of inferring user intent based on users&#39; web search interaction history. In this paper, we propose Token-level Proximal Policy Optimization (TPPO), a novel approach designed to empower LLMs to perform better in query generation through fine-tuning. TPPO is based on the Reinforcement Learning from AI Feedback (RLAIF) paradigm, consisting of a token-level reward model and a token-level proximal policy optimization module to address the sparse reward challenge in traditional RLAIF frameworks. To evaluate the effectiveness and robustness of TPPO, we conducted experiments on both an open-source dataset and an industrial dataset collected from a globally used search engine. The experimental results demonstrate that TPPO significantly improves the performance of query generation for LLMs and outperforms existing competitors. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00418">arXiv:2411.00418</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00418">pdf</a>, <a href="https://arxiv.org/format/2411.00418">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Self-Evolved Reward Learning for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chenghua Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Zhizhen Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fangkai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P">Pu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zeqi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongmei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rajmohan%2C+S">Saravan Rajmohan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Reinforcement Learning from Human Feedback (RLHF) is a crucial technique for aligning language models with human preferences, playing a pivotal role in the success of conversational models like GPT-4, ChatGPT, and Llama 2. A core challenge in employing RLHF lies in training a reliable reward model (RM), which relies on high-quality labels typically provided by human experts or advanced AI systems. These methods can be costly and may introduce biases that affect the language model&#39;s responses. As language models improve, human input may become less effective in further enhancing their performance. In this paper, we propose Self-Evolved Reward Learning (SER), a novel approach where the RM generates additional training data to iteratively improve itself. We conducted extensive experiments on multiple datasets such as HH-RLHF and UltraFeedback, using models like Mistral and Llama 3, and compared SER against various baselines. Our results demonstrate that even with limited human-annotated data, learning from self-feedback can robustly enhance RM performance, thereby boosting the capabilities of large language models (LLMs). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 6 figures</span> </p> </li>
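<p>The self-evolution loop can be pictured as confidence-filtered self-labeling: the reward model labels unlabeled response pairs, keeps only confident labels, and retrains on them. The sketch below uses placeholder methods (prob_a_preferred, fit), not the authors&#39; API.</p>
<pre><code class="language-python">
# High-level sketch of a self-evolved reward-learning loop as described
# in the abstract. rm is any object exposing the placeholder methods.
def self_evolved_reward_learning(rm, pairs, rounds=3, threshold=0.9):
    for _ in range(rounds):
        pseudo = []
        for a, b in pairs:                        # unlabeled response pairs
            p = rm.prob_a_preferred(a, b)         # RM's own preference belief
            if max(p, 1.0 - p) >= threshold:      # keep confident labels only
                pseudo.append((a, b) if p >= 0.5 else (b, a))
        rm.fit(pseudo)                            # retrain on self-labeled data
    return rm
</code></pre>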
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00064">arXiv:2411.00064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00064">pdf</a>, <a href="https://arxiv.org/format/2411.00064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The ISCSLP 2024 Conversational Voice Clone (CoVoC) Challenge: Tasks, Results and Findings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+K">Kangxiang Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dake Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+J">Jixun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hanzhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingqing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lei Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+M">Minghui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+P">Peng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The ISCSLP 2024 Conversational Voice Clone (CoVoC) Challenge aims to benchmark and advance zero-shot spontaneous style voice cloning, particularly focusing on generating spontaneous behaviors in conversational speech. The challenge comprises two tracks: an unconstrained track without limitations on data and model usage, and a constrained track allowing only the use of designated open-source datasets. A 100-hour high-quality conversational speech dataset is also made available with the challenge. This paper details the data, tracks, submitted systems, evaluation results, and findings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24039">arXiv:2410.24039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.24039">pdf</a>, <a href="https://arxiv.org/format/2410.24039">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Efficient Satellite-Ground Interconnection Design for Low-orbit Mega-Constellation Topology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenhao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jiazhi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Quanwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Handong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+K">Kun Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yue Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Interconnecting (CSGI) algorithm, the current state of the art, coordinates the establishment of ground-satellite links (GSLs). Compared with existing solutions, it reduces latency by 19% and jitter by 70% on average. However, CSGI only supports the scenario where terminals access a single satellite and cannot fully utilize the multi-access capabilities of terminals. Additionally, CSGI&#39;s high computational complexity poses deployment challenges. To overcome these problems, we propose the Classification-based Longest Remaining Service Time (C-LRST) algorithm. C-LRST supports the practical scenario with multi-access capabilities. It adds optional paths during routing with low computational complexity, improving end-to-end communication quality. We conduct a 1,000-second simulation from Brazil to Lithuania on the open-source platform Hypatia. Experiment results show that, compared with CSGI, C-LRST reduces latency and increases throughput by approximately 60% and 40%, respectively. In addition, C-LRST&#39;s GSL switching number is 14, whereas CSGI&#39;s is 23; C-LRST thus has better link stability than CSGI. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 14 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24032">arXiv:2410.24032</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.24032">pdf</a>, <a href="https://arxiv.org/format/2410.24032">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Navigating the Unknown: A Chat-Based Collaborative Interface for Personalized Exploratory Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yingzhe Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+X">Xiaoting Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongmei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rajmohan%2C+S">Saravan Rajmohan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The rise of large language models (LLMs) has revolutionized user interactions with knowledge-based systems, enabling chatbots to synthesize vast amounts of information and assist with complex, exploratory tasks. However, LLM-based chatbots often struggle to provide personalized support, particularly when users start with vague queries or lack sufficient contextual information. This paper introduces the Collaborative Assistant for Personalized Exploration (CARE), a system designed to enhance personalization in exploratory tasks by combining a multi-agent LLM framework with a structured user interface. CARE&#39;s interface consists of a Chat Panel, a Solution Panel, and a Needs Panel, enabling iterative query refinement and dynamic solution generation. The multi-agent framework collaborates to identify both explicit and implicit user needs, delivering tailored, actionable solutions. In a within-subject user study with 22 participants, CARE was consistently preferred over a baseline LLM chatbot, with users praising its ability to reduce cognitive load, inspire creativity, and provide more tailored solutions. Our findings highlight CARE&#39;s potential to transform LLM-based systems from passive information retrievers to proactive partners in personalized problem-solving and exploration. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
href="/search/cs?searchtype=author&amp;query=Gui%2C+T">Tao Gui</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23074v2-abstract-short" style="display: inline;"> We introduce MPLSandbox, an out-of-the-box multi-programming language sandbox designed to provide unified and comprehensive feedback from compiler and analysis tools for Large Language Models (LLMs). It can automatically identify the programming language of the code, compiling and executing it within an isolated sub-sandbox to ensure safety and stability. In addition, MPLSandbox also integrates bo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23074v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23074v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23074v2-abstract-full" style="display: none;"> We introduce MPLSandbox, an out-of-the-box multi-programming language sandbox designed to provide unified and comprehensive feedback from compiler and analysis tools for Large Language Models (LLMs). It can automatically identify the programming language of the code, compiling and executing it within an isolated sub-sandbox to ensure safety and stability. In addition, MPLSandbox also integrates both traditional and LLM-based code analysis tools, providing a comprehensive analysis of generated code. MPLSandbox can be effortlessly integrated into the training and deployment of LLMs to improve the quality and correctness of their generated code. It also helps researchers streamline their workflows for various LLM-based code-related tasks, reducing the development cost. To validate the effectiveness of MPLSandbox, we integrate it into training and deployment approaches, and also employ it to optimize workflows for a wide range of real-world code-related tasks. Our goal is to enhance researcher productivity on LLM-based code-related tasks by simplifying and automating workflows through delegation to MPLSandbox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23074v2-abstract-full').style.display = 'none'; document.getElementById('2410.23074v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22837">arXiv:2410.22837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22837">pdf</a>, <a href="https://arxiv.org/format/2410.22837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3233/FAIA240524">10.3233/FAIA240524 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for Infrared and Visible Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+K">Kun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingle Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+M">Maoxun Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yitian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22837v1-abstract-short" style="display: inline;"> Infrared and visible image fusion aims to utilize the complementary information from two modalities to generate fused images with prominent targets and rich texture details. Most existing algorithms only perform pixel-level or feature-level fusion from different modalities in the spatial domain. They usually overlook the information in the frequency domain, and some of them suffer from inefficienc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22837v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22837v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22837v1-abstract-full" style="display: none;"> Infrared and visible image fusion aims to utilize the complementary information from two modalities to generate fused images with prominent targets and rich texture details. Most existing algorithms only perform pixel-level or feature-level fusion from different modalities in the spatial domain. They usually overlook the information in the frequency domain, and some of them suffer from inefficiency due to excessively complex structures. To tackle these challenges, this paper proposes an efficient Spatial-Frequency Domain Fusion (SFDFusion) network for infrared and visible image fusion. First, we propose a Dual-Modality Refinement Module (DMRM) to extract complementary information. This module extracts useful information from both the infrared and visible modalities in the spatial domain and enhances fine-grained spatial details. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22837">arXiv:2410.22837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22837">pdf</a>, <a href="https://arxiv.org/format/2410.22837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.3233/FAIA240524">10.3233/FAIA240524 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for Infrared and Visible Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+K">Kun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingle Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+M">Maoxun Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yitian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Infrared and visible image fusion aims to utilize the complementary information from two modalities to generate fused images with prominent targets and rich texture details. Most existing algorithms only perform pixel-level or feature-level fusion from different modalities in the spatial domain. They usually overlook the information in the frequency domain, and some of them suffer from inefficiency due to excessively complex structures. To tackle these challenges, this paper proposes an efficient Spatial-Frequency Domain Fusion (SFDFusion) network for infrared and visible image fusion. First, we propose a Dual-Modality Refinement Module (DMRM) to extract complementary information. This module extracts useful information from both the infrared and visible modalities in the spatial domain and enhances fine-grained spatial details. Next, to introduce frequency domain information, we construct a Frequency Domain Fusion Module (FDFM) that transforms the spatial domain to the frequency domain through the Fast Fourier Transform (FFT) and then integrates frequency domain information. Additionally, we design a frequency domain fusion loss to provide guidance for the fusion process. Extensive experiments on public datasets demonstrate that our method produces fused images with significant advantages in various fusion metrics and visual effects. Furthermore, our method demonstrates high efficiency in image fusion and good performance on downstream detection tasks, thereby satisfying the real-time demands of advanced visual tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECAI 2024</span> </p> </li>
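<p>A frequency-domain fusion module of this kind can be pictured as mixing the two modalities&#39; spectra after an FFT. The sketch below fuses amplitudes and keeps the visible image&#39;s phase; this mixing rule is an assumption for illustration, not the paper&#39;s FDFM.</p>
<pre><code class="language-python">
# Sketch of a frequency-domain fusion step (illustrative; channel
# counts and the mixing rule are assumptions).
import torch

def frequency_domain_fuse(ir, vis):
    # ir, vis: (batch, 1, H, W) grayscale feature maps.
    IR = torch.fft.rfft2(ir)
    VI = torch.fft.rfft2(vis)
    # Fuse amplitudes (texture/energy) and keep the visible-light phase
    # (structure); one of many plausible mixing rules.
    amp = 0.5 * (IR.abs() + VI.abs())
    fused = amp * torch.exp(1j * VI.angle())
    return torch.fft.irfft2(fused, s=ir.shape[-2:])

out = frequency_domain_fuse(torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64))
print(out.shape)  # torch.Size([2, 1, 64, 64])
</code></pre>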
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22313">arXiv:2410.22313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22313">pdf</a>, <a href="https://arxiv.org/format/2410.22313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Senna: Bridging Large Vision-Language Models and End-to-End Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+B">Bencheng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xingyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> End-to-end autonomous driving demonstrates strong planning capabilities with large-scale data but still struggles in complex, rare scenarios due to limited commonsense. In contrast, Large Vision-Language Models (LVLMs) excel in scene understanding and reasoning. The path forward lies in merging the strengths of both approaches. Previous methods using LVLMs to predict trajectories or control signals yield suboptimal results, as LVLMs are not well suited for precise numerical predictions. This paper presents Senna, an autonomous driving system combining an LVLM (Senna-VLM) with an end-to-end model (Senna-E2E). Senna decouples high-level planning from low-level trajectory prediction. Senna-VLM generates planning decisions in natural language, while Senna-E2E predicts precise trajectories. Senna-VLM utilizes a multi-image encoding approach and multi-view prompts for efficient scene understanding. In addition, we introduce planning-oriented QAs alongside a three-stage training strategy, which enhances Senna-VLM&#39;s planning performance while preserving commonsense. Extensive experiments on two datasets show that Senna achieves state-of-the-art planning performance. Notably, with pre-training on the large-scale dataset DriveX and fine-tuning on nuScenes, Senna significantly reduces average planning error by 27.12% and collision rate by 33.33% over the model without pre-training. We believe Senna&#39;s cross-scenario generalization and transferability are essential for achieving fully autonomous driving. Code and models will be released at https://github.com/hustvl/Senna. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://github.com/hustvl/Senna</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21331">arXiv:2410.21331</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21331">pdf</a>, <a href="https://arxiv.org/format/2410.21331">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond Interpretability: The Gains of Feature Monosemanticity on Model Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Jingyi Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+X">Xiang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Q">Qi Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Jegelka%2C+S">Stefanie Jegelka</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yisen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Deep learning models often suffer from a lack of interpretability due to polysemanticity, where individual neurons are activated by multiple unrelated semantics, resulting in unclear attributions of model behavior. Recent advances in monosemanticity, where neurons correspond to consistent and distinct semantics, have significantly improved interpretability but are commonly believed to compromise accuracy. In this work, we challenge the prevailing belief of the accuracy-interpretability tradeoff, showing that monosemantic features not only enhance interpretability but also bring concrete gains in model performance. Across multiple robust learning scenarios, including input and label noise, few-shot learning, and out-of-domain generalization, our results show that models leveraging monosemantic features significantly outperform those relying on polysemantic features. Furthermore, we provide empirical and theoretical understanding of the robustness gains of feature monosemanticity. Our preliminary analysis suggests that monosemanticity, by promoting better separation of feature representations, leads to more robust decision boundaries. This diverse evidence highlights the generality of monosemanticity in improving model robustness. As a first step in this new direction, we embark on exploring the learning benefits of monosemanticity beyond interpretability, supporting the long-standing hypothesis linking interpretability and robustness. Code is available at https://github.com/PKU-ML/Beyond_Interpretability. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> </div> </main>
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10