Search | arXiv e-print repository

Showing 1–50 of 3,774 results for author: Zhang, L

Searching in archive cs. Results sorted by announcement date (newest first), 50 results per page.
1. arXiv:2411.14347 [pdf, other] (cs.CV)
DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding
Authors: Tianhe Ren, Yihao Chen, Qing Jiang, Zhaoyang Zeng, Yuda Xiong, Wenlong Liu, Zhengyu Ma, Junyi Shen, Yuan Gao, Xiaoke Jiang, Xingyu Chen, Zhuheng Song, Yuhong Zhang, Hongjie Huang, Han Gao, Shilong Liu, Hao Zhang, Feng Li, Kent Yu, Lei Zhang
Abstract: In this paper, we introduce DINO-X, a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extends its input options to support text prompt, visual prompt, and customized prompt. With such flexible prompt options, we develop a universal object prompt to support prompt-free open-world detection, making it possible to detect anything in an image without requiring users to provide any prompt. To enhance the model's core grounding capability, we have constructed a large-scale dataset with over 100 million high-quality grounding samples, referred to as Grounding-100M, for advancing the model's open-vocabulary detection performance. Pre-training on such a large-scale grounding dataset leads to a foundational object-level representation, which enables DINO-X to integrate multiple perception heads to simultaneously support multiple object perception and understanding tasks, including detection, segmentation, pose estimation, object captioning, object-based QA, etc. Experimental results demonstrate the superior performance of DINO-X. Specifically, the DINO-X Pro model achieves 56.0 AP, 59.8 AP, and 52.4 AP on the COCO, LVIS-minival, and LVIS-val zero-shot object detection benchmarks, respectively. Notably, it scores 63.3 AP and 56.5 AP on the rare classes of the LVIS-minival and LVIS-val benchmarks, both improving the previous SOTA performance by 5.8 AP. Such a result underscores its significantly improved capacity for recognizing long-tailed objects.
Submitted 21 November, 2024; originally announced November 2024.
Comments: Technical Report
2. arXiv:2411.13842 [pdf, other] (cs.CV)
Detecting Human Artifacts from Text-to-Image Models
Authors: Kaihong Wang, Lingzhi Zhang, Jianming Zhang
Abstract: Despite recent advancements, text-to-image generation models often produce images containing artifacts, especially in human figures. These artifacts appear as poorly generated human bodies, including distorted, missing, or extra body parts, leading to visual inconsistencies with typical human anatomy and greatly impairing overall fidelity. In this study, we address this challenge by curating Human Artifact Dataset (HAD), the first large-scale dataset specifically designed to identify and localize human artifacts. HAD comprises over 37,000 images generated by several popular text-to-image models, annotated for human artifact localization. Using this dataset, we train the Human Artifact Detection Models (HADM), which can identify diverse artifact types across multiple generative domains and demonstrate strong generalization, even on images from unseen generators. Additionally, to further improve generators' perception of human structural coherence, we use the predictions from our HADM as feedback for diffusion model finetuning. Our experiments confirm a reduction in human artifacts in the resulting model. Furthermore, we showcase a novel application of our HADM in an iterative inpainting framework to correct human artifacts in arbitrary images directly, demonstrating its utility in improving image quality. Our dataset and detection models are available at https://github.com/wangkaihong/HADM.
Submitted 21 November, 2024; originally announced November 2024.

3. arXiv:2411.13682 [pdf, other] (cs.LG, cs.CR, cs.DS)
Differentially Private Learning Beyond the Classical Dimensionality Regime
Authors: Cynthia Dwork, Pranay Tankala, Linjun Zhang
Abstract: We initiate the study of differentially private learning in the proportional dimensionality regime, in which the number of data samples $n$ and problem dimension $d$ approach infinity at rates proportional to one another, meaning that $d / n \to \delta$ as $n \to \infty$ for an arbitrary, given constant $\delta \in (0, \infty)$. This setting is significantly more challenging than that of all prior theoretical work in high-dimensional differentially private learning, which, despite the name, has assumed that $\delta = 0$ or is sufficiently small for problems of sample complexity $O(d)$, a regime typically considered "low-dimensional" or "classical" by modern standards in high-dimensional statistics. We provide sharp theoretical estimates of the error of several well-studied differentially private algorithms for robust linear regression and logistic regression, including output perturbation, objective perturbation, and noisy stochastic gradient descent, in the proportional dimensionality regime. The $1 + o(1)$ factor precision of our error estimates enables a far more nuanced understanding of the price of privacy of these algorithms than that afforded by existing, coarser analyses, which are essentially vacuous in the regime we consider. We incorporate several probabilistic tools that have not previously been used to analyze differentially private learning algorithms, such as a modern Gaussian comparison inequality and recent universality laws with origins in statistical physics.
Submitted 20 November, 2024; originally announced November 2024.
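For orientation, noisy stochastic gradient descent, one of the algorithms whose error the paper characterizes, follows a standard template: clip each per-example gradient, then add Gaussian noise. A minimal generic sketch (clipping norm C and noise scale sigma are placeholders; this illustrates the algorithm family only, not the paper's analysis):

```python
# Generic noisy SGD for logistic regression: clip per-example gradients to
# norm C, then add Gaussian noise scaled by sigma * C. Illustrative only.
import numpy as np

def noisy_sgd(X, y, steps=1000, lr=0.1, C=1.0, sigma=1.0, seed=0):
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(steps):
        i = rng.integers(n)
        g = -y[i] * X[i] / (1.0 + np.exp(y[i] * X[i] @ w))  # logistic-loss gradient
        g *= min(1.0, C / (np.linalg.norm(g) + 1e-12))      # clip to norm C
        w -= lr * (g + sigma * C * rng.normal(size=d))      # Gaussian noise for privacy
    return w

w = noisy_sgd(np.random.randn(200, 5), np.sign(np.random.randn(200)))
```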
4. arXiv:2411.13383 [pdf, other] (eess.IV, cs.CV)
Adversarial Diffusion Compression for Real-World Image Super-Resolution
Authors: Bin Chen, Gehui Li, Rongyuan Wu, Xindong Zhang, Jie Chen, Jian Zhang, Lei Zhang
Abstract: Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still incur high computational costs due to their reliance on large pretrained SD models. This paper proposes a novel Real-ISR method, AdcSR, by distilling the one-step diffusion network OSEDiff into a streamlined diffusion-GAN model under our Adversarial Diffusion Compression (ADC) framework. We meticulously examine the modules of OSEDiff, categorizing them into two types: (1) Removable (VAE encoder, prompt extractor, text encoder, etc.) and (2) Prunable (denoising UNet and VAE decoder). Since direct removal and pruning can degrade the model's generation capability, we pretrain our pruned VAE decoder to restore its ability to decode images and employ adversarial distillation to compensate for performance loss. This ADC-based diffusion-GAN hybrid design effectively reduces complexity by 73% in inference time, 78% in computation, and 74% in parameters, while preserving the model's generation capability. Experiments manifest that our proposed AdcSR achieves competitive recovery quality on both synthetic and real-world datasets, offering up to a 9.3$\times$ speedup over previous one-step diffusion-based methods. Code and models will be made available.
Submitted 20 November, 2024; originally announced November 2024.
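The adversarial-distillation step described in this abstract, a pruned student trained to match a one-step teacher while a discriminator supplies a realism signal, follows the usual distillation-plus-GAN template. A schematic sketch (modules and losses are placeholders, not AdcSR's actual design):

```python
# Schematic adversarial distillation: the student imitates the teacher's
# output (MSE) and tries to fool a discriminator. Illustrative only.
import torch
import torch.nn.functional as F

def distill_step(student, teacher, disc, lr_img, opt_s, opt_d):
    with torch.no_grad():
        target = teacher(lr_img)          # teacher's one-step restoration
    fake = student(lr_img)                # pruned student's attempt

    # Discriminator: separate teacher outputs from (detached) student outputs.
    d_loss = F.softplus(-disc(target)).mean() + F.softplus(disc(fake.detach())).mean()
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()

    # Student: distillation term plus non-saturating adversarial term.
    s_loss = F.mse_loss(fake, target) + F.softplus(-disc(fake)).mean()
    opt_s.zero_grad(); s_loss.backward(); opt_s.step()
    return s_loss.item(), d_loss.item()
```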
5. arXiv:2411.13069 [pdf] (cs.CV)
Automatic marker-free registration based on similar tetrahedras for single-tree point clouds
Authors: Jing Ren, Pei Wang, Hanlong Li, Yuhan Wu, Yuhang Gao, Wenxin Chen, Mingtai Zhang, Lingyun Zhang
Abstract: In recent years, terrestrial laser scanning technology has been widely used to collect tree point cloud data, aiding in measurements of diameter at breast height, biomass, and other forestry survey data. Since a single scan from terrestrial laser systems captures data from only one angle, multiple scans must be registered and fused to obtain complete tree point cloud data. This paper proposes a marker-free automatic registration method for single-tree point clouds based on similar tetrahedra. First, two point clouds from two scans of the same tree are used to generate tree skeletons, and key point sets are constructed from these skeletons. Tetrahedra are then filtered and matched according to similarity principles, with the vertices of the two matched tetrahedra selected as matching point pairs, thus completing the coarse registration of the point clouds from the two scans. Subsequently, the ICP method is applied to the coarse-registered leaf point clouds to obtain fine registration parameters, completing the precise registration of the two tree point clouds. Experiments were conducted using terrestrial laser scanning data from eight trees, each from a different species and with varying shapes. The proposed method was evaluated using RMSE and Hausdorff distance, and compared against the traditional ICP and NDT methods. The experimental results demonstrate that the proposed method significantly outperforms both ICP and NDT in registration accuracy, achieving speeds up to 593 and 113 times faster than ICP and NDT, respectively. In summary, the proposed method shows good robustness in single-tree point cloud registration, with significant advantages in accuracy and speed over traditional ICP and NDT methods, indicating excellent application prospects in practical registration scenarios.
Submitted 20 November, 2024; originally announced November 2024.
Comments: remote sensing; terrestrial lidar; multi-scan cloud registration
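Once matched tetrahedron vertices yield corresponding point pairs, coarse registration reduces to the classic least-squares rigid-transform problem, which ICP then refines. A minimal SVD (Kabsch) sketch, assuming matched (N, 3) arrays; this is the textbook solution, not the paper's implementation:

```python
# Kabsch/SVD estimate of the rigid transform from matched point pairs,
# i.e. the coarse-registration step that precedes ICP refinement.
import numpy as np

def rigid_transform(src: np.ndarray, dst: np.ndarray):
    """Least-squares R, t with dst ≈ src @ R.T + t for matched (N, 3) arrays."""
    cs, cd = src.mean(axis=0), dst.mean(axis=0)
    H = (src - cs).T @ (dst - cd)          # 3x3 cross-covariance
    U, _, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:               # guard against a reflection
        Vt[-1] *= -1
        R = Vt.T @ U.T
    t = cd - R @ cs
    return R, t
```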
6. arXiv:2411.12635 [pdf, other] (cs.CV)
M3D: Dual-Stream Selective State Spaces and Depth-Driven Framework for High-Fidelity Single-View 3D Reconstruction
Authors: Luoxi Zhang, Pragyan Shrestha, Yu Zhou, Chun Xie, Itaru Kitahara
Abstract: The precise reconstruction of 3D objects from a single RGB image in complex scenes presents a critical challenge in virtual reality, autonomous driving, and robotics. Existing neural implicit 3D representation methods face significant difficulties in balancing the extraction of global and local features, particularly in diverse and complex environments, leading to insufficient reconstruction precision and quality. We propose M3D, a novel single-view 3D reconstruction framework, to tackle these challenges. This framework adopts a dual-stream feature extraction strategy based on Selective State Spaces to effectively balance the extraction of global and local features, thereby improving scene comprehension and representation precision. Additionally, a parallel branch extracts depth information, effectively integrating visual and geometric features to enhance reconstruction quality and preserve intricate details. Experimental results indicate that the fusion of multi-scale features with depth information via the dual-branch feature extraction significantly boosts geometric consistency and fidelity, achieving state-of-the-art reconstruction performance.
Submitted 20 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.
Comments: 9 pages, 4 figures
ACM Class: I.3.5
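The dual-branch idea, one stream for visual features and a parallel stream for depth, fused before decoding, can be pictured with ordinary convolutions. In the paper the streams are built from Selective State Space blocks, so the layers below are invented purely for illustration:

```python
# Toy dual-stream fusion: RGB and depth branches produce feature maps that
# are concatenated and merged. Stand-in layers, not M3D's actual blocks.
import torch
import torch.nn as nn

class DualStreamFusion(nn.Module):
    def __init__(self, ch: int = 64):
        super().__init__()
        self.rgb_branch = nn.Sequential(nn.Conv2d(3, ch, 3, padding=1), nn.ReLU())
        self.depth_branch = nn.Sequential(nn.Conv2d(1, ch, 3, padding=1), nn.ReLU())
        self.fuse = nn.Conv2d(2 * ch, ch, 1)  # merge visual + geometric features

    def forward(self, rgb, depth):
        return self.fuse(torch.cat([self.rgb_branch(rgb), self.depth_branch(depth)], dim=1))

feats = DualStreamFusion()(torch.randn(1, 3, 64, 64), torch.randn(1, 1, 64, 64))
```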
7. arXiv:2411.12359 [pdf, other] (cs.RO)
TactV: A Class of Hybrid Terrestrial/Aerial Coaxial Tilt-Rotor Vehicles
Authors: Yifei Dong, Yimin Zhu, Lixian Zhang, Yihang Ding
Abstract: To enhance the obstacle-crossing and endurance capabilities of vehicles operating in complex environments, this paper presents the design of a hybrid terrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates advantages such as lightweight construction and high maneuverability. Unlike existing tandem dual-rotor vehicles, TactV employs a tiltable coaxial dual-rotor design and features a spherical cage structure that encases the body, allowing for omnidirectional movement while further reducing its overall dimensions. To enable TactV to maneuver flexibly on aerial, planar, and inclined surfaces, we established corresponding dynamic and control models for each mode. Additionally, we leveraged TactV's tiltable center of gravity to design energy-saving and high-mobility modes for ground operations, thereby further enhancing its endurance. Experimental designs for both aerial and ground tests corroborated the superiority of TactV's movement capabilities and control strategies.
Submitted 19 November, 2024; originally announced November 2024.
8. arXiv:2411.12156 [pdf, other] (cs.CL, cs.AI)
HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning with Hard Negatives
Authors: Wenxiao Liu, Zihong Yang, Chaozhuo Li, Zijin Hong, Jianfeng Ma, Zhiquan Liu, Litian Zhang, Feiran Huang
Abstract: Unsupervised sentence representation learning remains a critical challenge in modern natural language processing (NLP) research. Recently, contrastive learning techniques have achieved significant success in addressing this issue by effectively capturing textual semantics. Many such approaches prioritize optimization using negative samples. In fields such as computer vision, hard negative samples (samples that are close to the decision boundary and thus more difficult to distinguish) have been shown to enhance representation learning. However, adapting hard negatives to contrastive sentence learning is complex due to the intricate syntactic and semantic details of text. To address this problem, we propose HNCSE, a novel contrastive learning framework that extends the leading SimCSE approach. The hallmark of HNCSE is its innovative use of hard negative samples to enhance the learning of both positive and negative samples, thereby achieving a deeper semantic understanding. Empirical tests on semantic textual similarity and transfer task datasets validate the superiority of HNCSE.
Submitted 18 November, 2024; originally announced November 2024.
9. arXiv:2411.11940 [pdf, other] (cs.LG)
Introducing Milabench: Benchmarking Accelerators for AI
Authors: Pierre Delaunay, Xavier Bouthillier, Olivier Breuleux, Satya Ortiz-Gagné, Olexa Bilaniuk, Fabrice Normandin, Arnaud Bergeron, Bruno Carrez, Guillaume Alain, Soline Blanc, Frédéric Osterrath, Joseph Viviano, Roger Creus-Castanyer, Darshan Patil, Rabiul Awal, Le Zhang
Abstract: AI workloads, particularly those driven by deep learning, are introducing novel usage patterns to high-performance computing (HPC) systems that are not comprehensively captured by standard HPC benchmarks. As one of the largest academic research centers dedicated to deep learning, Mila identified the need to develop a custom benchmarking suite to address the diverse requirements of its community, which consists of over 1,000 researchers. This report introduces Milabench, the resulting benchmarking suite. Its design was informed by an extensive literature review encompassing 867 papers, as well as surveys conducted with Mila researchers. This rigorous process led to the selection of 26 primary benchmarks tailored for procurement evaluations, alongside 16 optional benchmarks for in-depth analysis. We detail the design methodology, the structure of the benchmarking suite, and provide performance evaluations using GPUs from NVIDIA, AMD, and Intel. The Milabench suite is open source and can be accessed at github.com/mila-iqia/milabench.
Submitted 18 November, 2024; originally announced November 2024.

10. arXiv:2411.11875 [pdf, other] (cs.IR, cs.AI, cs.CL, q-bio.BM)
Exploring Optimal Transport-Based Multi-Grained Alignments for Text-Molecule Retrieval
Authors: Zijun Min, Bingshuai Liu, Liang Zhang, Jia Song, Jinsong Su, Song He, Xiaochen Bo
Abstract: The field of bioinformatics has seen significant progress, making the cross-modal text-molecule retrieval task increasingly vital. This task focuses on accurately retrieving molecule structures based on textual descriptions, by effectively aligning textual descriptions and molecules to assist researchers in identifying suitable molecular candidates. However, many existing approaches overlook the details inherent in molecule sub-structures. In this work, we introduce the Optimal TRansport-based Multi-grained Alignments model (ORMA), a novel approach that facilitates multi-grained alignments between textual descriptions and molecules. Our model features a text encoder and a molecule encoder. The text encoder processes textual descriptions to generate both token-level and sentence-level representations, while molecules are modeled as hierarchical heterogeneous graphs, encompassing atom, motif, and molecule nodes to extract representations at these three levels. A key innovation in ORMA is the application of Optimal Transport (OT) to align tokens with motifs, creating multi-token representations that integrate multiple token alignments with their corresponding motifs. Additionally, we employ contrastive learning to refine cross-modal alignments at three distinct scales: token-atom, multitoken-motif, and sentence-molecule, ensuring that the similarities between correctly matched text-molecule pairs are maximized while those of unmatched pairs are minimized. To our knowledge, this is the first attempt to explore alignments at both the motif and multi-token levels. Experimental results on the ChEBI-20 and PCdes datasets demonstrate that ORMA significantly outperforms existing state-of-the-art (SOTA) models.
Submitted 4 November, 2024; originally announced November 2024.
Comments: BIBM 2024 Regular Paper
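Token-to-motif alignment with Optimal Transport, as ORMA describes, is typically computed with the standard Sinkhorn iteration. A bare-bones sketch under uniform marginals, illustrating the underlying solver rather than ORMA's code:

```python
# Entropic OT via Sinkhorn: alternately rescale rows and columns of the
# Gibbs kernel until both marginals match. Illustrative dimensions only.
import numpy as np

def sinkhorn(C: np.ndarray, eps: float = 0.1, iters: int = 200) -> np.ndarray:
    """Transport plan between uniform marginals for a cost matrix C of shape (n, m)."""
    n, m = C.shape
    K = np.exp(-C / eps)                   # Gibbs kernel
    a, b = np.ones(n) / n, np.ones(m) / m  # uniform marginals
    v = np.ones(m)
    for _ in range(iters):
        u = a / (K @ v)
        v = b / (K.T @ u)
    return u[:, None] * K * v[None, :]     # rows ≈ per-token alignment weights

plan = sinkhorn(1 - np.random.rand(12, 5))  # e.g. 12 text tokens vs. 5 motifs
```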
By embedding a backdoor visual language model into the visual perception module within the robotic system, we successfully mislead the robotic arm's operation in the physical world, given the presence of common items as triggers. Experimental evaluations in the physical world demonstrate the effectiveness of the proposed backdoor attack.
Submitted 18 November, 2024; originally announced November 2024.
Comments: Initial version with preliminary results. We welcome any feedback or suggestions.

arXiv:2411.11539 (https://arxiv.org/abs/2411.11539) [pdf, ps, other]
Categories: cs.IT (Information Theory); eess.SP (Signal Processing)
Title: Channel Capacity-Aware Distributed Encoding for Multi-View Sensing and Edge Inference
Authors: Mingjie Yang, Guangming Liang, Dongzhu Liu, Lei Zhang, Kaibin Huang
Abstract: Integrated sensing and communication (ISAC) unifies wireless communication and sensing by sharing spectrum and hardware, which often incurs trade-offs between the two functions due to limited resources.
However, this paper shifts focus to exploring the synergy between communication and sensing, using WiFi sensing as an exemplary scenario where communication signals are repurposed to probe the environment without dedicated sensing waveforms, followed by data uploading to the edge server for inference. While increased device participation enhances multi-view sensing data, it also imposes significant communication overhead between devices and the edge server. To address this challenge, we aim to maximize the sensing task performance, measured by mutual information, under the channel capacity constraint. The information-theoretic optimization problem is solved by the proposed ADE-MI, a novel framework that employs a two-stage optimization approach: (1) adaptive distributed encoding (ADE) at the device, which ensures transmitted bits are most relevant to sensing tasks, and (2) multi-view inference (MI) at the edge server, which orchestrates multi-view data from distributed devices. Our experimental results highlight the synergy between communication and sensing, showing that more frequent communication from WiFi access points to edge devices improves sensing inference accuracy. The proposed ADE-MI achieves 92% recognition accuracy with an over $10^4$-fold reduction in latency compared to schemes with raw-data communication, achieving both high sensing inference accuracy and low communication latency simultaneously.
Submitted 18 November, 2024; originally announced November 2024.
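The rate-relevance idea behind ADE-MI can be illustrated with a toy calculation: a plug-in estimate of the mutual information between a label and a feature quantized to different bit depths. The setup below (Gaussian noise, quantile binning) is an illustrative assumption, not the paper's encoder; it shows how spending more channel bits on a task-relevant feature raises the information available for inference.

```python
import numpy as np

def mutual_information(x, y):
    """Plug-in MI estimate (in nats) from paired discrete samples."""
    joint = np.zeros((x.max() + 1, y.max() + 1))
    for xi, yi in zip(x, y):
        joint[xi, yi] += 1
    joint /= joint.sum()
    px, py = joint.sum(1, keepdims=True), joint.sum(0, keepdims=True)
    nz = joint > 0
    return float((joint[nz] * np.log(joint[nz] / (px @ py)[nz])).sum())

rng = np.random.default_rng(0)
labels = rng.integers(0, 4, size=5000)              # sensing task label
feature = labels + rng.normal(0, 1.0, size=5000)    # noisy continuous observation

for bits in (1, 2, 4):                              # coarser quantization = fewer channel bits
    edges = np.quantile(feature, np.linspace(0, 1, 2**bits + 1)[1:-1])
    q = np.digitize(feature, edges)
    print(bits, "bits ->", round(mutual_information(q, labels), 3), "nats about the label")
```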
arXiv:2411.11477 (https://arxiv.org/abs/2411.11477) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: SL-YOLO: A Stronger and Lighter Drone Target Detection Model
Authors: Defan Chen, Luchan Zhang
Abstract: Detecting small objects in complex scenes, such as those captured by drones, is a daunting challenge due to the difficulty in capturing the complex features of small targets. While the YOLO family has achieved great success in large target detection, its performance is less than satisfactory when faced with small targets. To this end, this paper proposes SL-YOLO (Stronger and Lighter YOLO), a model that aims to break the bottleneck of small target detection. We propose the Hierarchical Extended Path Aggregation Network (HEPAN), a cross-scale feature fusion method that can ensure high detection accuracy even in challenging environments. At the same time, without sacrificing detection capability, we design the C2fDCB lightweight module and add the SCDown downsampling module to greatly reduce the model's parameters and computational complexity. Our experimental results on the VisDrone2019 dataset reveal a significant improvement in performance, with mAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to 28.9%. At the same time, the model parameters are reduced from 11.1M to 9.6M, and the FPS can reach 132, making it an ideal solution for real-time small object detection in resource-constrained environments.
Submitted 18 November, 2024; originally announced November 2024.
arXiv:2411.11448 (https://arxiv.org/abs/2411.11448) [pdf, other]
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting
Authors: Hongjun Wang, Jiyuan Chen, Lingyu Zhang, Renhe Jiang, Xuan Song
Abstract: Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown significant promise in traffic forecasting by effectively modeling temporal and spatial correlations. However, rapid urbanization in recent years has led to dynamic shifts in traffic patterns and travel demand, posing major challenges for accurate long-term traffic prediction. The generalization capability of ST-GNNs in extended temporal scenarios and cross-city applications remains largely unexplored. In this study, we evaluate state-of-the-art models on an extended traffic benchmark and observe substantial performance degradation in existing ST-GNNs over time, which we attribute to their limited inductive capabilities. Our analysis reveals that this degradation stems from an inability to adapt to evolving spatial relationships within urban environments. To address this limitation, we reconsider the design of adaptive embeddings and propose a Principal Component Analysis (PCA) embedding approach that enables models to adapt to new scenarios without retraining. We incorporate PCA embeddings into existing ST-GNN and Transformer architectures, achieving marked improvements in performance. Notably, PCA embeddings allow for flexibility in graph structures between training and testing, enabling models trained on one city to perform zero-shot predictions on other cities. This adaptability demonstrates the potential of PCA embeddings in enhancing the robustness and generalization of spatiotemporal models.
Submitted 18 November, 2024; originally announced November 2024.
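A minimal sketch of the core idea, under the assumption that node embeddings are taken as projections of each node's traffic history onto its top principal components (the paper's exact construction may differ): because the embedding is recomputed from data rather than learned per graph, it transfers to a new city with a different number of nodes.

```python
import numpy as np

def pca_node_embeddings(history, dim=16):
    """Project each node's history onto its top principal components.

    history: (num_timesteps, num_nodes) traffic readings.
    Returns (num_nodes, dim) embeddings that can be recomputed for a
    new city without retraining the forecasting model.
    """
    X = history.T                                   # nodes as samples
    X = X - X.mean(axis=0, keepdims=True)
    # SVD of the centered data; rows of Vt are principal directions
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    return X @ Vt[:dim].T

rng = np.random.default_rng(0)
city_a = rng.normal(size=(2016, 207))               # e.g. two weeks at 10-minute steps
city_b = rng.normal(size=(2016, 325))               # a different graph size is fine
emb_a = pca_node_embeddings(city_a)
emb_b = pca_node_embeddings(city_b)                 # zero-shot: same 16-dim space
print(emb_a.shape, emb_b.shape)                     # (207, 16) (325, 16)
```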
arXiv:2411.11098 (https://arxiv.org/abs/2411.11098) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: MolParser: End-to-end Visual Recognition of Molecule Structures in the Wild
Authors: Xi Fang, Jiankun Wang, Xiaochen Cai, Shangqian Chen, Shuwen Yang, Lin Yao, Linfeng Zhang, Guolin Ke
Abstract: In recent decades, chemistry publications and patents have increased rapidly. A significant portion of key information is embedded in molecular structure figures, complicating large-scale literature searches and limiting the application of large language models in fields such as biology, chemistry, and pharmaceuticals. The automatic extraction of precise chemical structures is of critical importance. However, the presence of numerous Markush structures in real-world documents, along with variations in molecular image quality, drawing styles, and noise, significantly limits the performance of existing optical chemical structure recognition (OCSR) methods.
We present MolParser, a novel end-to-end OCSR method that efficiently and accurately recognizes chemical structures from real-world documents, including difficult Markush structures. We use an extended SMILES encoding rule to annotate our training dataset. Under this rule, we build MolParser-7M, the largest annotated molecular image dataset to our knowledge. While utilizing a large amount of synthetic data, we employed active learning methods to incorporate substantial in-the-wild data, specifically samples cropped from real patents and scientific literature, into the training process. We trained an end-to-end molecular image captioning model, MolParser, using a curriculum learning approach. MolParser significantly outperforms classical and learning-based methods across most scenarios, with potential for broader downstream applications. The dataset is publicly available.
Submitted 17 November, 2024; originally announced November 2024.
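The paper's extended SMILES rules are not spelled out in the abstract, so the sketch below covers only the downstream-consumer side as an assumption: validating and canonicalizing standard SMILES predictions with RDKit, a common way to check OCSR outputs.

```python
from rdkit import Chem

def canonicalize(smiles):
    """Parse a predicted SMILES string; return its canonical form, or None if invalid."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol) if mol is not None else None

# toy check of OCSR output: caffeine vs. a corrupted prediction
for s in ("Cn1cnc2c1c(=O)n(C)c(=O)n2C", "Cn1cnc2c1c(=O)n(C)c(=O)n2C)"):
    print(s, "->", canonicalize(s))
```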
arXiv:2411.10803 (https://arxiv.org/abs/2411.10803) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Multi-Stage Vision Token Dropping: Towards Efficient Multimodal Large Language Model
Authors: Ting Liu, Liangtao Shi, Richang Hong, Yue Hu, Quanjun Yin, Linfeng Zhang
Abstract: The vision tokens in multimodal large language models usually exhibit significant spatial and temporal redundancy and take up most of the input tokens, which harms inference efficiency. To solve this problem, recent works drop unimportant tokens during inference, where the importance of each token is decided only by information from either the vision encoding stage or the prefilling stage. In this paper, we propose Multi-stage Token Dropping (MustDrop) to measure the importance of each token across its whole lifecycle, including the vision encoding, prefilling, and decoding stages. Concretely, in the vision encoding stage, MustDrop merges spatially adjacent tokens with high similarity and establishes a key-token set to retain the most vision-critical tokens, preventing them from being discarded in later stages. In the prefilling stage, MustDrop further compresses vision tokens under the guidance of text semantics, with a dual-attention filtering strategy. In the decoding stage, an output-aware cache policy is proposed to further reduce the size of the KV cache. By leveraging tailored strategies across the multi-stage process, MustDrop can more precisely recognize the important and redundant tokens, thus achieving an optimal balance between performance and efficiency. For instance, MustDrop reduces about 88.5% of FLOPs on LLaVA with a compression ratio of 92.2% while maintaining comparable accuracy. Our code is available at https://github.com/liuting20/MustDrop.
Submitted 16 November, 2024; originally announced November 2024.
Comments: 8 pages, 4 figures
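The vision-encoding-stage merge can be sketched as a greedy pass over the token sequence. The threshold, the averaging rule, and the 1-D neighbourhood below are illustrative assumptions; MustDrop's actual criterion operates on the 2-D patch grid.

```python
import torch
import torch.nn.functional as F

def merge_adjacent_tokens(tokens, threshold=0.9):
    """Greedily average each token into its left neighbour when their
    cosine similarity exceeds `threshold`. tokens: (num_tokens, dim)."""
    merged = [tokens[0]]
    for t in tokens[1:]:
        sim = F.cosine_similarity(merged[-1], t, dim=0)
        if sim > threshold:
            merged[-1] = (merged[-1] + t) / 2       # fuse redundant neighbours
        else:
            merged.append(t)
    return torch.stack(merged)

vision_tokens = torch.randn(576, 1024)              # e.g. a 24x24 patch grid, flattened
vision_tokens[100:110] = vision_tokens[100]         # a run of redundant tokens
out = merge_adjacent_tokens(vision_tokens)
print(vision_tokens.shape[0], "->", out.shape[0], "tokens")
```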
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10546">arXiv:2411.10546</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10546">pdf</a>, <a href="https://arxiv.org/format/2411.10546">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> The Oxford Spires Dataset: Benchmarking Large-Scale LiDAR-Visual Localisation, Reconstruction and Radiance Field Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tao%2C+Y">Yifu Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%C3%B1oz-Ba%C3%B1%C3%B3n%2C+M+%C3%81">Miguel 脕ngel Mu帽oz-Ba帽贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lintong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+L+F+T">Lanke Frank Tarimo Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Fallon%2C+M">Maurice Fallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10546v1-abstract-short" style="display: inline;"> This paper introduces a large-scale multi-modal dataset captured in and around well-known landmarks in Oxford using a custom-built multi-sensor perception unit as well as a millimetre-accurate map from a Terrestrial LiDAR Scanner (TLS). The perception unit includes three synchronised global shutter colour cameras, an automotive 3D LiDAR scanner, and an inertial sensor - all precisely calibrated. W&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10546v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10546v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10546v1-abstract-full" style="display: none;"> This paper introduces a large-scale multi-modal dataset captured in and around well-known landmarks in Oxford using a custom-built multi-sensor perception unit as well as a millimetre-accurate map from a Terrestrial LiDAR Scanner (TLS). The perception unit includes three synchronised global shutter colour cameras, an automotive 3D LiDAR scanner, and an inertial sensor - all precisely calibrated. We also establish benchmarks for tasks involving localisation, reconstruction, and novel-view synthesis, which enable the evaluation of Simultaneous Localisation and Mapping (SLAM) methods, Structure-from-Motion (SfM) and Multi-view Stereo (MVS) methods as well as radiance field methods such as Neural Radiance Fields (NeRF) and 3D Gaussian Splatting. To evaluate 3D reconstruction the TLS 3D models are used as ground truth. Localisation ground truth is computed by registering the mobile LiDAR scans to the TLS 3D models. 
Radiance field methods are evaluated not only with poses sampled from the input trajectory but also from viewpoints on trajectories distant from the training poses. Our evaluation demonstrates a key limitation of state-of-the-art radiance field methods: they tend to overfit to the training poses/images and do not generalise well to out-of-sequence poses. They also underperform in 3D reconstruction compared to MVS systems using the same visual inputs. Our dataset and benchmarks are intended to facilitate better integration of radiance field methods and SLAM systems. The raw and processed data, along with software for parsing and evaluation, can be accessed at https://dynamic.robots.ox.ac.uk/datasets/oxford-spires/.
Submitted 15 November, 2024; originally announced November 2024.
Comments: Website: https://dynamic.robots.ox.ac.uk/datasets/oxford-spires/
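The scan-to-TLS registration used for localisation ground truth can be approximated with off-the-shelf ICP, for example in Open3D. The file names and voxel/distance parameters below are placeholders, and the dataset's own registration pipeline is likely more elaborate.

```python
import numpy as np
import open3d as o3d

# hypothetical file names: one mobile LiDAR scan and the TLS ground-truth map
scan = o3d.io.read_point_cloud("mobile_scan.pcd")
tls_map = o3d.io.read_point_cloud("tls_map.pcd")

# downsample for speed, then refine a rough prior pose with point-to-point ICP
scan_ds = scan.voxel_down_sample(voxel_size=0.1)
map_ds = tls_map.voxel_down_sample(voxel_size=0.1)
init = np.eye(4)                                    # assume a rough initial guess
result = o3d.pipelines.registration.registration_icp(
    scan_ds, map_ds, max_correspondence_distance=0.5, init=init,
    estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint())
print("fitness:", result.fitness)
print("scan-to-map pose:\n", result.transformation)  # localisation reference for this scan
```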
arXiv:2411.10321 (https://arxiv.org/abs/2411.10321) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Probabilistic Prior Driven Attention Mechanism Based on Diffusion Model for Imaging Through Atmospheric Turbulence
Authors: Guodong Sun, Qixiang Ma, Liqiang Zhang, Hongwei Wang, Zixuan Gao, Haotian Zhang
Abstract: Atmospheric turbulence introduces severe spatial and geometric distortions, challenging traditional image restoration methods. We propose the Probabilistic Prior Turbulence Removal Network (PPTRN), which combines probabilistic diffusion-based prior modeling with Transformer-driven feature extraction to address this issue. PPTRN employs a two-stage approach: first, a latent encoder and Transformer are jointly trained on clear images to establish robust feature representations. Then, a Denoising Diffusion Probabilistic Model (DDPM) models prior distributions over latent vectors, guiding the Transformer in capturing diverse feature variations essential for restoration. A key innovation in PPTRN is the Probabilistic Prior Driven Cross Attention mechanism, which integrates the DDPM-generated prior with feature embeddings to reduce artifacts and enhance spatial coherence. Extensive experiments validate that PPTRN significantly improves restoration quality on turbulence-degraded images, setting a new benchmark in clarity and structural fidelity.
Submitted 15 November, 2024; originally announced November 2024.
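A generic sketch of prior-conditioned cross-attention in PyTorch, with image features as queries and DDPM-sampled prior tokens as keys/values; the dimensions and residual fusion are illustrative assumptions, not PPTRN's published architecture.

```python
import torch
import torch.nn as nn

class PriorCrossAttention(nn.Module):
    """Cross-attention in which restoration features attend to a prior vector."""
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, feats, prior):
        # feats: (B, N, dim) patch features; prior: (B, K, dim) prior tokens
        attended, _ = self.attn(query=feats, key=prior, value=prior)
        return self.norm(feats + attended)          # residual fusion

feats = torch.randn(2, 64, 256)                     # e.g. an 8x8 feature map, flattened
prior = torch.randn(2, 1, 256)                      # one diffusion-sampled prior token
out = PriorCrossAttention()(feats, prior)
print(out.shape)                                    # torch.Size([2, 64, 256])
```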
arXiv:2411.09924 (https://arxiv.org/abs/2411.09924) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: A Polarization Image Dehazing Method Based on the Principle of Physical Diffusion
Authors: Zhenjun Zhang, Lijun Tang, Hongjin Wang, Lilian Zhang, Yunze He, Yaonan Wang
Abstract: Computer vision is increasingly used in areas such as unmanned vehicles, surveillance systems and remote sensing. However, in foggy scenarios, image degradation leads to loss of target details, which seriously affects the accuracy and effectiveness of these vision tasks. Because its electromagnetic waves vibrate in a specific direction, polarized light resists scattering and refraction in complex media more effectively than unpolarized light. As a result, polarized light is better able to maintain its polarization characteristics in complex transmission media and under long-distance imaging conditions. This property makes polarized imaging especially suitable for complex scenes such as outdoor and underwater environments, particularly in fog, where higher-quality images can be obtained. Based on this advantage, we propose an innovative semi-physical polarization dehazing method that does not rely on an external light source. The method simulates the diffusion process of fog and designs a diffusion kernel that corresponds to the image blurriness caused by this diffusion. By employing spatiotemporal Fourier transforms and deconvolution operations, the method recovers the state of fog droplets prior to diffusion and the light-inversion distribution of objects. This approach effectively achieves dehazing and detail enhancement of the scene.
Submitted 14 November, 2024; originally announced November 2024.
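The Fourier-domain deconvolution at the heart of the method can be illustrated with a classic Wiener filter applied against a known Gaussian "diffusion" kernel. This is a textbook stand-in offered as an assumption; the paper designs its own kernel from the fog-diffusion model.

```python
import numpy as np

def wiener_deconvolve(blurred, kernel, snr=100.0):
    """Invert a known blur in the Fourier domain with Wiener regularization."""
    K = np.fft.fft2(np.fft.ifftshift(kernel), s=blurred.shape)
    B = np.fft.fft2(blurred)
    W = np.conj(K) / (np.abs(K) ** 2 + 1.0 / snr)   # Wiener filter
    return np.real(np.fft.ifft2(W * B))

# toy scene blurred by an isotropic Gaussian kernel standing in for fog diffusion
x, y = np.meshgrid(np.arange(-32, 32), np.arange(-32, 32))
kernel = np.exp(-(x**2 + y**2) / (2 * 3.0**2))
kernel /= kernel.sum()

scene = np.zeros((64, 64)); scene[20:44, 30:34] = 1.0   # a slender target
blurred = np.real(np.fft.ifft2(np.fft.fft2(scene) * np.fft.fft2(np.fft.ifftshift(kernel))))
restored = wiener_deconvolve(blurred, kernel)
print("recovered peak:", restored.max().round(2))       # close to the original 1.0
```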
arXiv:2411.09866 (https://arxiv.org/abs/2411.09866) [pdf, ps, other]
Categories: cs.IT (Information Theory)
Title: Power Allocation for Compute-and-Forward over Fading Channels
Authors: Lanwei Zhang, Jamie Evans, Jingge Zhu
Abstract: Compute-and-forward (CF) is a relaying strategy which allows the relay to decode a linear combination of the transmitted messages. This work studies the optimal power allocation problem for the CF scheme in fast fading channels for maximizing the symmetric computation rate, which is a non-convex optimization problem with no simple analytical or numerical solutions. In the first part of the paper, we investigate the problem when there are finitely many channel states (discrete case). We establish several important properties of the optimal solutions and show that if all users share the same power allocation policy (symmetric policy), the optimal solution takes a water-filling form when the power constraint exceeds a certain threshold. However, if asymmetric policies are allowed, the optimal solution does not take this form for any power constraint. We propose a low-complexity order-based algorithm for both scenarios and compare its performance with baseline algorithms. In the second part of the paper, we state relevant results when the channel coefficients are modelled as continuous random variables (continuous case) and propose a similar low-complexity iterative algorithm for the symmetric policy scenario. Numerical results are provided for both the discrete and continuous cases. They show that, in general, our proposed algorithm finds good suboptimal solutions with low complexity, and for some of the examples considered, it finds an exact optimal solution.
Submitted 14 November, 2024; originally announced November 2024.
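For reference, the classic water-filling form that the symmetric-policy solution resembles, computed by bisection on the water level. This is the textbook allocation for sum-rate maximization, not the paper's CF-specific variant or its threshold condition.

```python
import numpy as np

def water_filling(gains, total_power):
    """Classic water-filling: maximize sum log(1 + g_i p_i) s.t. sum p_i <= P."""
    # bisect on the water level mu, with p_i = max(mu - 1/g_i, 0)
    lo, hi = 0.0, total_power + 1.0 / gains.min()
    for _ in range(100):
        mu = (lo + hi) / 2
        p = np.maximum(mu - 1.0 / gains, 0.0)
        if p.sum() > total_power:
            hi = mu
        else:
            lo = mu
    return np.maximum(lo - 1.0 / gains, 0.0)

gains = np.array([2.0, 1.0, 0.25, 0.1])             # fading channel gains
p = water_filling(gains, total_power=2.0)
print(p.round(3), "sum =", p.sum().round(3))        # stronger channels get more power
```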
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09866v1-abstract-full').style.display = 'none'; document.getElementById('2411.09866v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08800">arXiv:2411.08800</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08800">pdf</a>, <a href="https://arxiv.org/format/2411.08800">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Mesoscale and Nanoscale Physics">cond-mat.mes-hall</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning Accelerated Quantum Transport Simulations in Nanoelectronics: From Break Junctions to Field-Effect Transistors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zou%2C+J">Jijie Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhouyin%2C+Z">Zhanghao Zhouyin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dongying Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Linfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+S">Shimin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Q">Qiangqiang Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08800v1-abstract-short" style="display: inline;"> Quantum transport calculations are essential for understanding and designing nanoelectronic devices, yet the trade-off between accuracy and computational efficiency has long limited their practical applications. We present a general framework that combines the deep learning tight-binding Hamiltonian (DeePTB) approach with the non-equilibrium Green&#39;s Function (NEGF) method, enabling efficient quant&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08800v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08800v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08800v1-abstract-full" style="display: none;"> Quantum transport calculations are essential for understanding and designing nanoelectronic devices, yet the trade-off between accuracy and computational efficiency has long limited their practical applications. We present a general framework that combines the deep learning tight-binding Hamiltonian (DeePTB) approach with the non-equilibrium Green&#39;s Function (NEGF) method, enabling efficient quantum transport calculations while maintaining first-principles accuracy. 
We demonstrate the capabilities of the DeePTB-NEGF framework through two representative applications: comprehensive simulation of break-junction systems, where conductance histograms show good agreement with experimental measurements in both metallic-contact and single-molecule junction cases; and simulation of carbon nanotube field-effect transistors through self-consistent NEGF-Poisson calculations, capturing essential physics including the electrostatic potential and transfer characteristic curves under finite bias conditions. This framework bridges the gap between first-principles accuracy and computational efficiency, providing a powerful tool for high-throughput quantum transport simulations across different scales in nanoelectronics.
Submitted 13 November, 2024; originally announced November 2024.
Comments: 10 pages, 4 figures
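The NEGF transmission calculation itself is compact enough to sketch. The toy below computes the Landauer transmission of a tight-binding chain between two semi-infinite 1-D leads using the analytic surface Green's function; it illustrates only the Green's-function machinery, whereas in the actual framework DeePTB supplies the Hamiltonian.

```python
import numpy as np

def transmission(E, H, t_lead=1.0, eta=1e-9):
    """Landauer transmission T(E) for a device H coupled to two 1-D leads (NEGF).

    Toy sketch: nearest-neighbour leads treated with the analytic
    surface-Green's-function formula; H is the device Hamiltonian (n x n).
    """
    z = E + 1j * eta
    # retarded surface Green's function of a semi-infinite 1-D chain
    g = (z - 1j * np.sqrt(4 * t_lead**2 - z**2)) / (2 * t_lead**2)
    n = H.shape[0]
    sigma_L = np.zeros((n, n), complex); sigma_L[0, 0] = t_lead**2 * g
    sigma_R = np.zeros((n, n), complex); sigma_R[-1, -1] = t_lead**2 * g
    G = np.linalg.inv(z * np.eye(n) - H - sigma_L - sigma_R)   # retarded Green's function
    gamma_L = 1j * (sigma_L - sigma_L.conj().T)                # lead broadening matrices
    gamma_R = 1j * (sigma_R - sigma_R.conj().T)
    return np.real(np.trace(gamma_L @ G @ gamma_R @ G.conj().T))

# perfect 4-site chain: transmission should be ~1 inside the band |E| < 2
H = np.diag(np.ones(3), 1) * -1.0
H = H + H.T
print(round(transmission(0.0, H), 3))
```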
arXiv:2411.08592 (https://arxiv.org/abs/2411.08592) [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Slender Object Scene Segmentation in Remote Sensing Image Based on Learnable Morphological Skeleton with Segment Anything Model
Authors: Jun Xie, Wenxiao Li, Faqiang Wang, Liqiang Zhang, Zhengyang Hou, Jun Liu
Abstract: Morphological methods play a crucial role in remote sensing image processing, due to their ability to capture and preserve small structural details. However, most existing deep learning models for semantic segmentation are based on the encoder-decoder architecture, including U-Net and the Segment Anything Model (SAM), where the downsampling process tends to discard fine details. In this paper, we propose a new approach that integrates a learnable morphological skeleton prior into deep neural networks using the variational method. To address the difficulty of backpropagation caused by the non-differentiability of classical morphological operations, we provide a smooth representation of the morphological skeleton and design a variational segmentation model integrating the skeleton prior by employing operator splitting and dual methods. We then integrate this model into the network architecture of SAM by adding a token to the mask decoder and modifying the final sigmoid layer, ensuring that the final segmentation results preserve the skeleton structure as much as possible. Experimental results on remote sensing datasets, including buildings and roads, demonstrate that our method outperforms the original SAM on slender object segmentation and exhibits better generalization capability.
Submitted 13 November, 2024; originally announced November 2024.
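One common way to smooth morphological operations for backpropagation, offered here only as an assumption about the flavour of the paper's construction, is to replace min/max with log-sum-exp soft versions; the skeleton then follows from iterated soft erosions.

```python
import torch
import torch.nn.functional as F

def soft_erode(x, beta=50.0):
    """Smooth 3x3 erosion: a log-sum-exp soft-min, differentiable for backprop."""
    patches = F.unfold(x, kernel_size=3, padding=1)          # (B, 9, H*W)
    softmin = -torch.logsumexp(-beta * patches, dim=1) / beta
    return softmin.view_as(x[:, 0]).unsqueeze(1)

def soft_skeleton(x, iters=5):
    """Morphological skeleton via iterated soft erosion/opening (toy form)."""
    skel = torch.zeros_like(x)
    for _ in range(iters):
        eroded = soft_erode(x)
        opened = -soft_erode(-eroded)                        # soft dilation of the erosion
        skel = torch.maximum(skel, torch.relu(x - opened))   # keep the ridge residue
        x = eroded
    return skel

mask = torch.zeros(1, 1, 32, 32, requires_grad=True)
with torch.no_grad():
    mask[0, 0, 8:24, 14:18] = 1.0                            # a slender bar
skel = soft_skeleton(mask.clone())
skel.sum().backward()                                        # gradients flow end to end
print(float(skel.sum()))
```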
arXiv:2411.08510 (https://arxiv.org/abs/2411.08510) [pdf, other]
Categories: cs.SE (Software Engineering)
Title: CorrectBench: Automatic Testbench Generation with Functional Self-Correction using LLMs for HDL Design
Authors: Ruidi Qiu, Grace Li Zhang, Rolf Drechsler, Ulf Schlichtmann, Bing Li
Abstract: Functional simulation is an essential step in digital hardware design. Recently, there has been a growing interest in leveraging Large Language Models (LLMs) for hardware testbench generation tasks. However, the inherent instability associated with LLMs often leads to functional errors in the generated testbenches. Previous methods lack automatic functional correction mechanisms that work without human intervention, and they still suffer from low success rates, especially on sequential tasks. To address this issue, we propose CorrectBench, an automatic testbench generation framework with functional self-validation and self-correction. Utilizing only the RTL specification in natural language, the proposed approach can validate the correctness of the generated testbenches with a success rate of 88.85%. Furthermore, the proposed LLM-based corrector employs bug information obtained during the self-validation process to perform functional self-correction on the generated testbenches. The comparative analysis demonstrates that our method achieves a pass ratio of 70.13% across all evaluated tasks, compared with the previous LLM-based testbench generation framework's 52.18% and a direct LLM-based generation method's 33.33%. On sequential circuits specifically, our method outperforms previous work by 62.18% and achieves almost 5 times the pass ratio of the direct method. The code and experimental results are open-sourced at https://github.com/AutoBench/CorrectBench.
Submitted 13 November, 2024; originally announced November 2024.
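The validate-then-correct loop can be sketched as below; call_llm and run_simulation are hypothetical stand-ins (a canned string and a trivial structural check), not CorrectBench's actual prompts or simulator interface.

```python
def call_llm(prompt: str) -> str:
    # hypothetical stand-in: a real system would call an LLM API here
    return 'module tb; initial begin $display("test"); $finish; end endmodule'

def run_simulation(testbench: str) -> list[str]:
    # toy "functional self-validation": flag an obvious structural omission
    bugs = []
    if "$finish" not in testbench:
        bugs.append("simulation never terminates: missing $finish")
    return bugs

def generate_testbench(spec: str, max_rounds: int = 3) -> str:
    tb = call_llm(f"Write a Verilog testbench for this RTL spec:\n{spec}")
    for _ in range(max_rounds):
        bugs = run_simulation(tb)                   # self-validation
        if not bugs:
            break
        # self-correction: feed the observed bug info back to the model
        tb = call_llm("Fix these errors:\n" + "\n".join(bugs) + "\n" + tb)
    return tb

print(generate_testbench("2-bit counter with synchronous reset"))
```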
arXiv:2411.08373 (https://arxiv.org/abs/2411.08373) [pdf, other]
Categories: cs.RO (Robotics)
Title: DG-SLAM: Robust Dynamic Gaussian Splatting SLAM with Hybrid Pose Optimization
Authors: Yueming Xu, Haochen Jiang, Zhongyang Xiao, Jianfeng Feng, Li Zhang
Abstract: Achieving robust and precise pose estimation in dynamic scenes is a significant research challenge in Visual Simultaneous Localization and Mapping (SLAM). Recent advancements integrating Gaussian Splatting into SLAM systems have proven effective in creating high-quality renderings using explicit 3D Gaussian models, significantly improving environmental reconstruction fidelity. However, these approaches depend on a static-environment assumption and face challenges in dynamic environments due to inconsistent observations of geometry and photometry. To address this problem, we propose DG-SLAM, the first robust dynamic visual SLAM system grounded in 3D Gaussians, which provides precise camera pose estimation alongside high-fidelity reconstructions. Specifically, we propose effective strategies, including motion mask generation, adaptive Gaussian point management, and a hybrid camera tracking algorithm, to improve the accuracy and robustness of pose estimation. Extensive experiments demonstrate that DG-SLAM delivers state-of-the-art performance in camera pose estimation, map reconstruction, and novel-view synthesis in dynamic scenes, outperforming existing methods while preserving real-time rendering ability.
Submitted 13 November, 2024; originally announced November 2024.
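The abstract does not detail the motion mask generation, but a minimal residual-based version of the idea looks like this; the threshold and the channel averaging are illustrative assumptions.

```python
import torch

def motion_mask(rendered, observed, thresh=0.05):
    """Mark pixels whose photometric residual against the rendered static map
    is large; such pixels likely belong to moving objects and would be
    excluded from tracking and mapping."""
    residual = (rendered - observed).abs().mean(dim=0)   # average over RGB channels
    return residual > thresh

rendered = torch.rand(3, 120, 160)                       # render of the static scene
observed = rendered.clone()
observed[:, 40:60, 50:90] += 0.5                         # a moving object appears
mask = motion_mask(rendered, observed)
print(int(mask.sum()), "dynamic pixels flagged")
```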
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08373v1-abstract-full').style.display = 'none'; document.getElementById('2411.08373v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07740">arXiv:2411.07740</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07740">pdf</a>, <a href="https://arxiv.org/format/2411.07740">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D Focusing-and-Matching Network for Multi-Instance Point Cloud Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+L">Le Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Y">Yuchao Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07740v1-abstract-short" style="display: inline;"> Multi-instance point cloud registration aims to estimate the pose of all instances of a model point cloud in the whole scene. Existing methods all adopt the strategy of first obtaining the global correspondence and then clustering to obtain the pose of each instance. However, due to the cluttered and occluded objects in the scene, it is difficult to obtain an accurate correspondence between the mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07740v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07740v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07740v1-abstract-full" style="display: none;"> Multi-instance point cloud registration aims to estimate the pose of all instances of a model point cloud in the whole scene. Existing methods all adopt the strategy of first obtaining the global correspondence and then clustering to obtain the pose of each instance. However, due to the cluttered and occluded objects in the scene, it is difficult to obtain an accurate correspondence between the model point cloud and all instances in the scene. To this end, we propose a simple yet powerful 3D focusing-and-matching network for multi-instance point cloud registration by learning the multiple pair-wise point cloud registration. Specifically, we first present a 3D multi-object focusing module to locate the center of each object and generate object proposals. By using self-attention and cross-attention to associate the model point cloud with structurally similar objects, we can locate potential matching instances by regressing object centers. 
Then, we propose a 3D dual-masking instance matching module to estimate the pose between the model point cloud and each object proposal. It uses an instance mask and an overlap mask to accurately predict the pair-wise correspondence. Extensive experiments on two public benchmarks, Scan2CAD and ROBI, show that our method achieves new state-of-the-art performance on the multi-instance point cloud registration task. Code is available at https://github.com/zlynpu/3DFMNet.
Submitted 12 November, 2024; originally announced November 2024.
Comments: Accepted to NeurIPS 2024

arXiv:2411.07658 (https://arxiv.org/abs/2411.07658) [pdf, other]
Categories: cs.IR (Information Retrieval); cs.CY (Computers and Society)
Title: Advancing Sustainability via Recommender Systems: A Survey
Authors: Xin Zhou, Lei Zhang, Honglei Zhang, Yixin Zhang, Xiaoxiong Zhang, Jie Zhang, Zhiqi Shen
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07658">arXiv:2411.07658</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07658">pdf</a>, <a href="https://arxiv.org/format/2411.07658">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Advancing Sustainability via Recommender Systems: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Honglei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yixin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoxiong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zhiqi Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07658v1-abstract-full" style="display: inline;"> Human behavioral patterns and consumption paradigms have emerged as pivotal determinants in environmental degradation and climate change, with quotidian decisions pertaining to transportation, energy utilization, and resource consumption collectively precipitating substantial ecological impacts. Recommender systems, which generate personalized suggestions based on user preferences and historical interaction data, exert considerable influence on individual behavioral trajectories. However, conventional recommender systems predominantly optimize for user engagement and economic metrics, inadvertently neglecting the environmental and societal ramifications of their recommendations, potentially catalyzing over-consumption and reinforcing unsustainable behavioral patterns. Given their instrumental role in shaping user decisions, there exists an imperative need for sustainable recommender systems that incorporate sustainability principles to foster eco-conscious and socially responsible choices. This comprehensive survey addresses this critical research gap by presenting a systematic analysis of sustainable recommender systems. As these systems can simultaneously advance multiple sustainability objectives--including resource conservation, sustainable consumer behavior, and social impact enhancement--examining their implementations across distinct application domains provides a more rigorous analytical framework. Through a methodological analysis of domain-specific implementations encompassing transportation, food, buildings, and auxiliary sectors, we can better elucidate how these systems holistically advance sustainability objectives while addressing sector-specific constraints and opportunities. Moreover, we delineate future research directions for evolving recommender systems beyond sustainability advocacy toward fostering environmental resilience and social consciousness in society. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 10 figures. Working paper: https://github.com/enoche/SusRec</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07551">arXiv:2411.07551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07551">pdf</a>, <a href="https://arxiv.org/format/2411.07551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SP-VIO: Robust and Efficient Filter-Based Visual Inertial Odometry with State Transformation Model and Pose-Only Visual Description </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xueyu Du</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+C">Chengjun Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lilian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xinchan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huaiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Maosong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wenqi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+J">Jun Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07551v1-abstract-full" style="display: inline;"> Due to the advantages of high computational efficiency and small memory requirements, filter-based visual inertial odometry (VIO) has a good application prospect in miniaturized and payload-constrained embedded systems. However, the filter-based method has the problem of insufficient accuracy. To this end, we propose the State transformation and Pose-only VIO (SP-VIO) by rebuilding the state and measurement models, and considering further visual deprived conditions. In detail, we first propose a system model based on the double state transformation extended Kalman filter (DST-EKF), which has been proven to have better observability and consistency than models based on the extended Kalman filter (EKF) and the state transformation extended Kalman filter (ST-EKF). Secondly, to reduce the influence of linearization error caused by inaccurate 3D reconstruction, we adopt the Pose-only (PO) theory to decouple the measurement model from 3D features. Moreover, to deal with visual deprived conditions, we propose a double state transformation Rauch-Tung-Striebel (DST-RTS) backtracking method to optimize motion trajectories during visual interruption. Experiments on public (EuRoC, TUM-VI, KITTI) and personal datasets show that SP-VIO has better accuracy and efficiency than state-of-the-art (SOTA) VIO algorithms, and has better robustness under visual deprived conditions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07446">arXiv:2411.07446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07446">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Accurate Prompt Optimization: the Benefit of Memory in Exemplar-Guided Reflection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruihui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiaopu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+K">Kai Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qingsong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+Y">Yangyang Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07446v1-abstract-full" style="display: inline;"> Automatic prompt engineering aims to enhance the generation quality of large language models (LLMs). Recent works utilize feedback generated from erroneous cases to guide prompt optimization. During inference, they may further retrieve several semantically related exemplars and concatenate them with the optimized prompts to improve performance. However, those works only utilize the feedback at the current step, ignoring historical and unselected feedback which is potentially beneficial. Moreover, the selection of exemplars only considers the general semantic relationship and may not be optimal in terms of task performance and matching with the optimized prompt. In this work, we propose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize more efficient and accurate prompt optimization. Specifically, we design an exemplar-guided reflection mechanism where the feedback generation is additionally guided by the generated exemplars. We further build two kinds of memory to fully utilize the historical feedback information and support more effective exemplar retrieval. Empirical evaluations show our method surpasses previous state-of-the-art methods with fewer optimization steps, i.e., improving the F1 score by 10.1 on the LIAR dataset, and halving the optimization steps on ProTeGi. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
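<p>The two memories described above admit a simple reading: store past feedback with its observed utility, and retrieve entries that are both similar to the current case and historically helpful. A toy sketch under generic assumptions (embeddings are supplied externally; all class and parameter names are hypothetical, not ERM's API):</p> <pre><code class="language-python">
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

class FeedbackMemory:
    """Toy memory over past feedback/exemplars, keyed by embeddings.

    Hypothetical stand-in: real entries would be feedback texts and
    exemplars produced while optimizing the prompt, with a utility
    score reflecting how much each one improved the prompt.
    """
    def __init__(self):
        self.entries = []   # list of (embedding, payload, utility)

    def add(self, emb, payload, utility):
        self.entries.append((np.asarray(emb, dtype=float), payload, float(utility)))

    def retrieve(self, query_emb, k=3):
        # rank by similarity to the query, weighted by stored utility,
        # so historically helpful entries outrank merely similar ones
        scored = [(cosine(query_emb, e) * u, p) for e, p, u in self.entries]
        scored.sort(key=lambda s: -s[0])
        return [p for _, p in scored[:k]]
</code></pre>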
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07360">arXiv:2411.07360</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07360">pdf</a>, <a href="https://arxiv.org/format/2411.07360">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> ChatGPT Inaccuracy Mitigation during Technical Report Understanding: Are We There Yet? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tamanna%2C+S+B">Salma Begum Tamanna</a>, <a href="/search/cs?searchtype=author&amp;query=Uddin%2C+G">Gias Uddin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+L">Lan Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Longyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07360v1-abstract-full" style="display: none;"> Hallucinations, the tendency to produce irrelevant/incorrect responses, are prevalent concerns in generative AI-based tools like ChatGPT. Although hallucinations in ChatGPT have been studied for textual responses, it is unknown how ChatGPT hallucinates for technical texts that contain both textual and technical terms. We surveyed 47 software engineers and produced a benchmark of 412 Q&amp;A pairs from the bug reports of two OSS projects. We find that a RAG-based ChatGPT (i.e., ChatGPT tuned with the benchmark issue reports) is 36.4% correct when producing answers to the questions, due to two reasons: 1) limitations in understanding complex technical content in code snippets like stack traces, and 2) limitations in integrating the contexts denoted by the technical terms and texts. We present CHIME (ChatGPT Inaccuracy Mitigation Engine) whose underlying principle is that if we can preprocess the technical reports better and guide the query validation process in ChatGPT, we can address the observed limitations. CHIME uses a context-free grammar (CFG) to parse stack traces in technical reports. CHIME then verifies and fixes ChatGPT responses by applying metamorphic testing and query transformation. In our benchmark, CHIME shows a 30.3% improvement over ChatGPT responses. In a user study, we find that the improved responses with CHIME are considered more useful than those generated by ChatGPT without CHIME. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025) </p>
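<p>Parsing stack traces before handing a report to the model is the concrete preprocessing step named above. A simplified stand-in (a regex for Java-style frames rather than CHIME's actual CFG; the frame format is our assumption):</p> <pre><code class="language-python">
import re

# one Java-style stack frame: "at pkg.Class.method(File.java:123)"
FRAME = re.compile(r"^\s*at\s+([\w.$]+)\.([\w$]+)\(([\w.]+):(\d+)\)")

def parse_stack_trace(text):
    """Extract (class, method, file, line) tuples from a raw stack trace.

    Simplified stand-in for a CFG-based parser: a regex covers the
    common frame shape; a real grammar also handles nested causes,
    native frames, and truncated traces.
    """
    frames = []
    for raw in text.splitlines():
        m = FRAME.match(raw)
        if m:
            cls, method, fname, line = m.groups()
            frames.append({"class": cls, "method": method,
                           "file": fname, "line": int(line)})
    return frames

trace = """java.lang.NullPointerException
    at com.example.Cache.get(Cache.java:42)
    at com.example.Server.handle(Server.java:118)"""
print(parse_stack_trace(trace))
</code></pre>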
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07021">arXiv:2411.07021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07021">pdf</a>, <a href="https://arxiv.org/format/2411.07021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Invar-RAG: Invariant LLM-aligned Retrieval for Better Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jianghua Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07021v2-abstract-full" style="display: inline;"> Retrieval-augmented generation (RAG) has shown impressive capability in providing reliable answer predictions and addressing hallucination problems. A typical RAG implementation uses powerful retrieval models to extract external information and large language models (LLMs) to generate answers. In contrast, recent LLM-based retrieval has gained attention for its substantial improvements in information retrieval (IR) due to the LLMs&#39; semantic understanding capability. However, directly applying LLMs to RAG systems presents challenges. This may cause feature locality problems, as massive parametric knowledge can hinder effective usage of global information across the corpus; for example, an LLM-based retriever often inputs document summaries instead of full documents. Moreover, various pre-trained tasks in LLMs introduce variance, further weakening performance as a retriever. To address these issues, we propose a novel two-stage fine-tuning architecture called Invar-RAG. In the retrieval stage, an LLM-based retriever is constructed by integrating LoRA-based representation learning to tackle feature locality issues. To enhance retrieval performance, we develop two patterns (invariant and variant patterns) and an invariance loss to reduce LLM variance. In the generation stage, a refined fine-tuning method is employed to improve LLM accuracy in generating answers based on retrieved information. Experimental results show that Invar-RAG significantly outperforms existing baselines across three open-domain question answering (ODQA) datasets. Code is available in the Supplementary Material for reproducibility. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
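<p>The abstract does not define the invariance loss; one plausible shape for it, offered purely as a guess, is a symmetric KL divergence pulling the relevance distributions produced by the two retrieval patterns toward each other (a PyTorch sketch; the names and the loss form are our assumptions, not the paper's definition):</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def invariance_loss(scores_a, scores_b):
    """Penalize disagreement between two retrieval 'patterns'.

    scores_a, scores_b: (batch, num_docs) relevance logits from the
    invariant and variant patterns. Symmetric KL between the induced
    document distributions is one way to enforce agreement.
    """
    log_p = F.log_softmax(scores_a, dim=-1)
    log_q = F.log_softmax(scores_b, dim=-1)
    kl_pq = F.kl_div(log_q, log_p, reduction="batchmean", log_target=True)
    kl_qp = F.kl_div(log_p, log_q, reduction="batchmean", log_target=True)
    return 0.5 * (kl_pq + kl_qp)

loss = invariance_loss(torch.randn(4, 16), torch.randn(4, 16))
</code></pre>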
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06881">arXiv:2411.06881</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06881">pdf</a>, <a href="https://arxiv.org/format/2411.06881">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> WassFFed: Wasserstein Fair Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zhongxuan Han</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chaochao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiaolin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+F">Fei Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.06881v1-abstract-full" style="display: inline;"> Federated Learning (FL) employs a training approach to address scenarios where users&#39; data cannot be shared across clients. Achieving fairness in FL is imperative since training data in FL is inherently geographically distributed among diverse user groups. Existing research on fairness predominantly assumes access to the entire training data, making direct transfer to FL challenging. However, the limited existing research on fairness in FL does not effectively address two key challenges: (CH1) current methods fail to deal with the inconsistency between fair optimization results obtained with surrogate functions and fair classification results; and (CH2) directly aggregating local fair models does not always yield a globally fair model due to non-Identical and Independent data Distributions (non-IID) among clients. To address these challenges, we propose a Wasserstein Fair Federated Learning framework, namely WassFFed. To tackle CH1, we ensure that the outputs of local models, rather than the loss calculated with surrogate functions or classification results with a threshold, remain independent of various user groups. To resolve CH2, we employ a Wasserstein barycenter calculation of all local models&#39; outputs for each user group, bringing local model outputs closer to the global output distribution to ensure consistency between the global model and local models. We conduct extensive experiments on three real-world datasets, demonstrating that WassFFed outperforms existing approaches in striking a balance between accuracy and fairness. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to TKDE</span> </p>
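<p>For one-dimensional model outputs (e.g., classification scores), the Wasserstein-2 barycenter of empirical distributions has a closed form: average the clients' quantile functions. A minimal NumPy sketch of that aggregation step (illustrative only; the function and variable names are ours, and the paper's construction may differ in detail):</p> <pre><code class="language-python">
import numpy as np

def barycenter_1d(samples_per_client, grid_size=256):
    """W2 barycenter of 1-D empirical distributions via quantile averaging.

    samples_per_client: list of 1-D arrays of local model outputs for a
    single user group. In 1-D, the barycenter's quantile function is the
    mean of the clients' quantile functions.
    """
    qs = np.linspace(0.0, 1.0, grid_size)
    quantiles = [np.quantile(s, qs) for s in samples_per_client]
    return np.mean(quantiles, axis=0)   # barycenter as grid_size quantiles

# toy example: three clients with shifted score distributions for one group
rng = np.random.default_rng(1)
clients = [rng.normal(loc=m, scale=0.1, size=500) for m in (0.3, 0.5, 0.6)]
bary_q = barycenter_1d(clients)         # common target distribution
</code></pre>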
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06409">arXiv:2411.06409</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06409">pdf</a>, <a href="https://arxiv.org/format/2411.06409">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Automated Strategy Invention for Confluence of Term Rewrite Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mitterwallner%2C+F">Fabian Mitterwallner</a>, <a href="/search/cs?searchtype=author&amp;query=Jakubuv%2C+J">Jan Jakubuv</a>, <a href="/search/cs?searchtype=author&amp;query=Kaliszyk%2C+C">Cezary Kaliszyk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.06409v1-abstract-full" style="display: inline;"> Term rewriting plays a crucial role in software verification and compiler optimization. With dozens of highly parameterizable techniques developed to prove various system properties, automatic term rewriting tools work in an extensive parameter space. This complexity exceeds human capacity for parameter selection, motivating an investigation into automated strategy invention. In this paper, we focus on confluence, an important property of term rewrite systems, and apply machine learning to develop the first learning-guided automatic confluence prover. Moreover, we randomly generate a large dataset to analyze confluence for term rewrite systems. Our results focus on improving the state-of-the-art automatic confluence prover CSI: when equipped with our invented strategies, it surpasses its human-designed strategies both on the augmented dataset and on the original human-created benchmark dataset Cops, proving/disproving the confluence of several term rewrite systems for which no automated proofs were known before. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.4.2; I.2.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06334">arXiv:2411.06334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06334">pdf</a>, <a href="https://arxiv.org/format/2411.06334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> A Multicast Scheme for Live Streaming Courses in Large-Scale, Geographically Dense Campus Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Senxin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jinlong Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Ling Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.06334v1-abstract-full" style="display: inline;"> Video courses have become a significant component of modern education. However, the increasing demand for live streaming video courses places considerable strain on the service capabilities of campus networks. The challenges associated with live streaming course videos in campus network environments exhibit distinct spatial distribution characteristics. The audience for specific video courses may be highly concentrated in certain areas, leading to a large number of users attempting to access the same live stream simultaneously. Utilizing a Content Delivery Network (CDN) to distribute videos in these campus scenarios creates substantial unicast pressure on edge CDN servers. This paper proposes a two-layer dynamic partitioning Recursive Bit String (RBS) virtual domain network layer multicast architecture specifically designed for large-scale, geographically dense multicast scenarios within campus networks. This approach reduces redundant multicast messages by approximately 10-30% compared to the two-layer fixed partitioning method. Additionally, it establishes multicast source authentication capabilities based on Source Address Validation Improvement (SAVI) and facilitates secure multicast group key exchange using a concise exchange protocol within the WebRTC framework. In the next-generation data plane of programmable software-defined networks, the RBS stateless multicast technology can be integrated with the unique characteristics of large-scale, geographically dense campus network scenarios to dynamically and efficiently extend multicast coverage to every dormitory. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06333">arXiv:2411.06333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06333">pdf</a>, <a href="https://arxiv.org/format/2411.06333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Learned Proximal Alternating Minimization Algorithm and Its Induced Network for a Class of Two-block Nonconvex and Nonsmooth Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yunmei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Lezhi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.06333v1-abstract-full" style="display: inline;"> This work proposes a general learned proximal alternating minimization algorithm, LPAM, for solving learnable two-block nonsmooth and nonconvex optimization problems. We tackle the nonsmoothness by an appropriate smoothing technique with an automatically diminishing smoothing effect. For smoothed nonconvex problems, we modify the proximal alternating linearized minimization (PALM) scheme by incorporating the residual learning architecture, which has proven to be highly effective in deep network training, and employing the block coordinate descent (BCD) iterates as a safeguard for the convergence of the algorithm. We prove that the iterates generated by LPAM have at least one accumulation point and that each accumulation point is a Clarke stationary point. Our method is widely applicable, as one can employ various learning problems formulated as two-block optimizations, and it is also easy to extend for solving multi-block nonsmooth and nonconvex optimization problems. The network, whose architecture follows LPAM exactly, namely LPAM-net, inherits the convergence properties of the algorithm, making the network interpretable. As an example application of LPAM-net, we present numerical and theoretical results on the application of LPAM-net to joint multi-modal MRI reconstruction with significantly under-sampled k-space data. The experimental results indicate that the proposed LPAM-net is parameter-efficient and has favourable performance in comparison with some state-of-the-art methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
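<p>As context for the modification described above, a minimal NumPy sketch of the vanilla PALM backbone on a concrete two-block problem, 0.5*||A x - y||^2 plus l1 terms on both blocks, without LPAM's learned residual blocks or BCD safeguard; the objective and all names are our illustration:</p> <pre><code class="language-python">
import numpy as np

def soft_threshold(v, tau):
    return np.sign(v) * np.maximum(np.abs(v) - tau, 0.0)

def palm(A, lam=0.1, iters=200):
    """Vanilla PALM for min_{x,y} 0.5*||A x - y||^2 + lam*(||x||_1 + ||y||_1).

    Alternates one proximal gradient step per block, with step sizes set
    from the blockwise Lipschitz constants of the smooth coupling term.
    """
    m, n = A.shape
    x, y = np.zeros(n), np.zeros(m)
    Lx = np.linalg.norm(A, 2) ** 2          # Lipschitz constant of grad_x
    Ly = 1.0                                # Lipschitz constant of grad_y
    for _ in range(iters):
        r = A @ x - y
        x = soft_threshold(x - (A.T @ r) / Lx, lam / Lx)   # x-block step
        r = A @ x - y                       # refresh residual after x-update
        y = soft_threshold(y + r / Ly, lam / Ly)           # y-block step
    return x, y

x, y = palm(np.random.default_rng(2).normal(size=(20, 40)))
</code></pre>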
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05961">arXiv:2411.05961</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05961">pdf</a>, <a href="https://arxiv.org/format/2411.05961">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Aligned Vector Quantization for Edge-Cloud Collaborative Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ganesan%2C+D">Deepak Ganesan</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+H">Hui Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.05961v1-abstract-full" style="display: inline;"> Vision Language Models (VLMs) are central to Visual Question Answering (VQA) systems and are typically deployed in the cloud due to their high computational demands. However, this cloud-only approach underutilizes edge computational resources and requires significant bandwidth for transmitting raw images. In this paper, we introduce an edge-cloud collaborative VQA system, called LLaVA-AlignedVQ, which features a novel Aligned Vector Quantization algorithm (AlignedVQ) that efficiently compresses intermediate features without compromising accuracy to support partitioned execution. Our experiments demonstrate that LLaVA-AlignedVQ achieves approximately a 1365x compression rate on intermediate features, reducing data transmission overhead by 96.8% compared to transmitting JPEG90-compressed images to the cloud. LLaVA-AlignedVQ achieves an inference speedup of 2-15x while maintaining high accuracy, remaining within -2.23% to +1.6% of the original model&#39;s accuracy performance across eight VQA datasets, compared to the cloud-only solution. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05750">arXiv:2411.05750</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05750">pdf</a>, <a href="https://arxiv.org/ps/2411.05750">ps</a>, <a href="https://arxiv.org/format/2411.05750">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On Differentially Private String Distances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J+Y">Jerry Yao-Chieh Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+E">Erzhi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Han Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lichen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.05750v1-abstract-full" style="display: inline;"> Given a database of bit strings $A_1,\ldots,A_m\in \{0,1\}^n$, a fundamental data structure task is to estimate the distances between a given query $B\in \{0,1\}^n$ and all the strings in the database. In addition, one might further want to ensure the integrity of the database by releasing these distance statistics in a secure manner. In this work, we propose differentially private (DP) data structures for this type of task, with a focus on Hamming and edit distance. On top of the strong privacy guarantees, our data structures are also time- and space-efficient. In particular, our data structure is $\varepsilon$-DP against any sequence of queries of arbitrary length, and for any query $B$ such that the maximum distance to any string in the database is at most $k$, we output $m$ distance estimates. Moreover: for Hamming distance, our data structure answers any query in $\widetilde O(mk+n)$ time and each estimate deviates from the true distance by at most $\widetilde O(k/e^{\varepsilon/\log k})$; for edit distance, our data structure answers any query in $\widetilde O(mk^2+n)$ time and each estimate deviates from the true distance by at most $\widetilde O(k/e^{\varepsilon/(\log k \log n)})$. For moderate $k$, both data structures support sublinear query operations. We obtain these results via a novel adaptation of the randomized response technique as a bit flipping procedure, applied to the sketched strings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
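<p>The primitive named in the last sentence is classical randomized response applied as bit flipping. A self-contained toy version on raw (unsketched) strings, with the standard debiased Hamming estimate; this illustrates the mechanism only, not the paper's sketches or error bounds:</p> <pre><code class="language-python">
import numpy as np

def privatize(bits, eps, rng):
    """Randomized response: flip each bit independently w.p. 1/(1 + e^eps).

    Per-bit flipping with this probability is the classic eps-DP
    randomized-response mechanism (here applied to the raw string;
    the paper applies it to sketched strings).
    """
    flip_p = 1.0 / (1.0 + np.exp(eps))
    flips = rng.binomial(1, flip_p, size=bits.shape)
    return np.bitwise_xor(bits, flips)

def hamming_estimate(noisy_a, b, eps):
    """Debiased Hamming distance between a privatized string and a query.

    With flip probability q, E[observed disagreements] = n*q + (1-2q)*d,
    where d is the true distance; inverting this affine map debiases.
    """
    q = 1.0 / (1.0 + np.exp(eps))
    observed = np.bitwise_xor(noisy_a, b).sum()
    return (observed - noisy_a.size * q) / (1.0 - 2.0 * q)

rng = np.random.default_rng(4)
a = rng.integers(0, 2, size=4096)
b = a.copy()
b[:100] = 1 - b[:100]                   # true Hamming distance is 100
est = hamming_estimate(privatize(a, 2.0, rng), b, 2.0)   # est is near 100
</code></pre>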
This paper addresses this limitation by investigating the potential of adopting the reasoning ability of LLMs for generat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05212v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05212v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05212v1-abstract-full" style="display: none;"> Recent advances in Large Language Models (LLMs) have showcased their remarkable reasoning capabilities, making them influential across various fields. However, in robotics, their use has primarily been limited to manipulation planning tasks due to their inherent textual output. This paper addresses this limitation by investigating the potential of adopting the reasoning ability of LLMs for generating numerical predictions in robotics tasks, specifically for robotic grasping. We propose Reasoning Tuning, a novel method that integrates a reasoning phase before prediction during training, leveraging the extensive prior knowledge and advanced reasoning abilities of LLMs. This approach enables LLMs, notably with multi-modal capabilities, to generate accurate numerical outputs like grasp poses that are context-aware and adaptable through conversations. Additionally, we present the Reasoning Tuning VLM Grasp dataset, carefully curated to facilitate the adaptation of LLMs to robotic grasping. Extensive validation on both grasping datasets and real-world experiments underscores the adaptability of multi-modal LLMs for numerical prediction tasks in robotics. This not only expands their applicability but also bridges the gap between text-based planning and direct robot control, thereby maximizing the potential of LLMs in robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05212v1-abstract-full').style.display = 'none'; document.getElementById('2411.05212v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IROS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04713">arXiv:2411.04713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04713">pdf</a>, <a href="https://arxiv.org/format/2411.04713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Reward as Condition for Instruction-based Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Libo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Fan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Longyin Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+T">Tiejian Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sijie Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04713v1-abstract-short" style="display: inline;"> High-quality training triplets (instruction, original image, edited image) are essential for instruction-based image editing. Predominant training datasets (e.g., InsPix2Pix) are created using text-to-image generative models (e.g., Stable Diffusion, DALL-E) which are not trained for image editing. Accordingly, these datasets suffer from inaccurate instruction following, poor detail preserving, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04713v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04713v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04713v1-abstract-full" style="display: none;"> High-quality training triplets (instruction, original image, edited image) are essential for instruction-based image editing. Predominant training datasets (e.g., InsPix2Pix) are created using text-to-image generative models (e.g., Stable Diffusion, DALL-E) which are not trained for image editing. Accordingly, these datasets suffer from inaccurate instruction following, poor detail preserving, and generation artifacts. In this paper, we propose to address the training data quality issue with multi-perspective reward data instead of refining the ground-truth image quality. 1) we first design a quantitative metric system based on best-in-class LVLM (Large Vision Language Model), i.e., GPT-4o in our case, to evaluate the generation quality from 3 perspectives, namely, instruction following, detail preserving, and generation quality. For each perspective, we collected quantitative score in $0\sim 5$ and text descriptive feedback on the specific failure points in ground-truth edited images, resulting in a high-quality editing reward dataset, i.e., RewardEdit20K. 
2) We further proposed a novel training framework to seamlessly integrate the metric output, regarded as multi-reward, into editing models to learn from the imperfect training triplets. During training, the reward scores and text descriptions are encoded as embeddings and fed into both the latent space and the U-Net of the editing models as auxiliary conditions. During inference, we set these additional conditions to the highest score with no text description for failure points, to aim at the best generation outcome. Experiments indicate that our multi-reward conditioned model outperforms its no-reward counterpart on two popular editing pipelines, i.e., InsPix2Pix and SmartEdit. The code and dataset will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04713v1-abstract-full').style.display = 'none'; document.getElementById('2411.04713v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04571">arXiv:2411.04571</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04571">pdf</a>, <a href="https://arxiv.org/format/2411.04571">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DomainGallery: Few-shot Domain-driven Image Generation by Attribute-centric Finetuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Y">Yuxuan Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+Y">Yan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+J">Jun Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Huijia Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weiqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianfu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+L">Li Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liqing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04571v1-abstract-short" style="display: inline;"> The recent progress in text-to-image models pretrained on large-scale datasets has enabled us to generate various images as long as we provide a text prompt describing what we want. Nevertheless, the availability of these models is still limited when we expect to generate images that fall into a specific domain either hard to describe or just unseen to the models. 
In this work, we propose DomainGa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04571v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04571v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04571v1-abstract-full" style="display: none;"> The recent progress in text-to-image models pretrained on large-scale datasets has enabled us to generate various images as long as we provide a text prompt describing what we want. Nevertheless, the availability of these models is still limited when we expect to generate images that fall into a specific domain either hard to describe or just unseen to the models. In this work, we propose DomainGallery, a few-shot domain-driven image generation method which aims at finetuning pretrained Stable Diffusion on few-shot target datasets in an attribute-centric manner. Specifically, DomainGallery features prior attribute erasure, attribute disentanglement, regularization and enhancement. These techniques are tailored to few-shot domain-driven generation in order to solve key issues that previous works have failed to settle. Extensive experiments are given to validate the superior performance of DomainGallery on a variety of domain-driven generation scenarios. Codes are available at https://github.com/Ldhlwh/DomainGallery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04571v1-abstract-full').style.display = 'none'; document.getElementById('2411.04571v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04494">arXiv:2411.04494</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04494">pdf</a>, <a href="https://arxiv.org/format/2411.04494">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Online Omnidirectional Jumping Trajectory Planning for Quadrupedal Robots on Uneven Terrains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yue%2C+L">Linzhu Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zhitao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jinhu Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhongyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lingwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+X">Xuanqi Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Sreenath%2C+K">Koushil Sreenath</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yun-hui Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04494v2-abstract-short" style="display: inline;"> Natural terrain complexity often necessitates agile movements like jumping in animals to improve traversal efficiency. To enable similar capabilities in quadruped robots, complex real-time jumping maneuvers are required. Current research does not adequately address the problem of online omnidirectional jumping and neglects the robot&#39;s kinodynamic constraints during trajectory generation. This pape&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04494v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04494v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04494v2-abstract-full" style="display: none;"> Natural terrain complexity often necessitates agile movements like jumping in animals to improve traversal efficiency. To enable similar capabilities in quadruped robots, complex real-time jumping maneuvers are required. Current research does not adequately address the problem of online omnidirectional jumping and neglects the robot&#39;s kinodynamic constraints during trajectory generation. This paper proposes a general and complete cascade online optimization framework for omnidirectional jumping for quadruped robots. Our solution systematically encompasses jumping trajectory generation, a trajectory tracking controller, and a landing controller. It also incorporates environmental perception to navigate obstacles that standard locomotion cannot bypass, such as jumping from high platforms. 
We introduce a novel jumping plane to parameterize omnidirectional jumping motion and formulate a tightly coupled optimization problem accounting for the kinodynamic constraints, simultaneously optimizing CoM trajectory, Ground Reaction Forces (GRFs), and joint states. To meet the online requirements, we propose an accelerated evolutionary algorithm as the trajectory optimizer to address the complexity of kinodynamic constraints. To ensure stability and accuracy in environmental perception post-landing, we introduce a coarse-to-fine relocalization method that combines global Branch and Bound (BnB) search with Maximum a Posteriori (MAP) estimation for precise positioning during navigation and jumping. The proposed framework achieves jump trajectory generation in approximately 0.1 seconds with a warm start and has been successfully validated on two quadruped robots on uneven terrains. Additionally, we extend the framework's versatility to humanoid robots.
Submitted 9 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
Comments: Submitted to IJRR
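
As a rough illustration of the accelerated evolutionary optimization this abstract describes, here is a toy (mu + lambda)-style search over a low-dimensional jump parameterization in Python. The cost terms, bounds, and warm start are invented for the example; the paper's jumping-plane parameterization and kinodynamic constraints are only named, not specified, in this listing.

    # Illustrative sketch only: a toy evolutionary search over an invented
    # jump parameterization, with penalty terms standing in for kinodynamic
    # constraints. Not the paper's optimizer.
    import numpy as np

    def jump_cost(p):
        # p = [apex_height, pitch, grf_peak]; hypothetical penalized objective
        apex, pitch, grf = p
        effort = grf ** 2 + 0.1 * pitch ** 2
        # soft penalties for violating invented kinodynamic limits
        penalty = 1e3 * max(0.0, grf - 2.5) ** 2 + 1e3 * max(0.0, 0.2 - apex) ** 2
        return effort - 2.0 * apex + penalty

    def evolve(cost, dim=3, pop=32, elite=8, iters=100, sigma=0.3, seed=0):
        rng = np.random.default_rng(seed)
        mean = np.array([0.3, 0.0, 1.0])       # warm start (hypothetical)
        for _ in range(iters):
            cand = mean + sigma * rng.standard_normal((pop, dim))
            scores = np.array([cost(c) for c in cand])
            mean = cand[np.argsort(scores)[:elite]].mean(axis=0)  # recombine elites
            sigma *= 0.98                      # slowly anneal step size
        return mean

    print(evolve(jump_cost))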

arXiv:2411.04413 [pdf, other] cs.RO
Seeing Through Pixel Motion: Learning Obstacle Avoidance from Optical Flow with One Camera
Authors: Yu Hu, Yuang Zhang, Yunlong Song, Yang Deng, Feng Yu, Linzuo Zhang, Weiyao Lin, Danping Zou, Wenxian Yu
Abstract: Optical flow captures the motion of pixels in an image sequence over time, providing information about movement, depth, and environmental structure. Flying insects utilize this information to navigate and avoid obstacles, allowing them to execute highly agile maneuvers even in complex environments. Despite its potential, autonomous flying robots have yet to fully leverage this motion information to achieve comparable levels of agility and robustness. Challenges of control from optical flow include extracting accurate optical flow at high speeds, handling noisy estimation, and ensuring robust performance in complex environments. To address these challenges, we propose a novel end-to-end system for quadrotor obstacle avoidance using monocular optical flow. We develop an efficient differentiable simulator coupled with a simplified quadrotor model, allowing our policy to be trained directly through first-order gradient optimization. Additionally, we introduce a central flow attention mechanism and an action-guided active sensing strategy that enhances the policy's focus on task-relevant optical flow observations to enable more responsive decision-making during flight. Our system is validated both in simulation and the real world using an FPV racing drone. Despite being trained in a simple simulated environment, our system demonstrates agile and robust flight in various unknown, cluttered environments in the real world at speeds of up to 6 m/s.
Submitted 6 November, 2024; originally announced November 2024.
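
The phrase "trained directly through first-order gradient optimization" refers to backpropagating through a differentiable simulator. Below is a minimal sketch of that training pattern, using an invented point-mass model and a crude flow-like observation in place of the paper's quadrotor model and central flow attention.

    # Sketch: train a policy by differentiating through a toy simulator.
    # The dynamics, observation, and loss terms are all invented here.
    import torch
    import torch.nn as nn

    policy = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))
    opt = torch.optim.Adam(policy.parameters(), lr=1e-3)
    dt, goal, obstacle = 0.05, torch.tensor([1.0, 0.0]), torch.tensor([0.5, 0.0])

    for step in range(200):
        pos, vel = torch.zeros(2), torch.zeros(2)
        loss = torch.tensor(0.0)
        for _ in range(40):                          # differentiable rollout
            obs = torch.cat([obstacle - pos, vel])   # crude flow-like observation
            acc = policy(obs)
            vel = vel + dt * acc                     # point-mass dynamics
            pos = pos + dt * vel
            loss = loss + 0.1 * (pos - goal).pow(2).sum()
            loss = loss + torch.relu(0.2 - (pos - obstacle).norm())  # clearance
        opt.zero_grad()
        loss.backward()                              # first-order gradients through sim
        opt.step()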

arXiv:2411.03914 [pdf, other] cs.CY cs.LG
Game-Theoretic Machine Unlearning: Mitigating Extra Privacy Leakage
Authors: Hengzhu Liu, Tianqing Zhu, Lefeng Zhang, Ping Xiong
Abstract: With the extensive use of machine learning technologies, data providers encounter increasing privacy risks. Recent legislation, such as GDPR, obligates organizations to remove requested data and its influence from a trained model. Machine unlearning is an emerging technique designed to enable machine learning models to erase users' private information. Although several efficient machine unlearning schemes have been proposed, these methods still have limitations. First, removing the contributions of partial data may lead to model performance degradation. Second, discrepancies between the original and the generated unlearned models can be exploited by attackers to obtain the target sample's information, resulting in additional privacy leakage risks. To address the above challenges, we propose a game-theoretic machine unlearning algorithm that simulates the competitive relationship between unlearning performance and privacy protection. The algorithm comprises an unlearning module and a privacy module. The unlearning module has a loss function composed of model distance and classification error, which is used to derive the optimal strategy. The privacy module aims to make it difficult for an attacker to infer membership information from the unlearned data, thereby reducing the privacy leakage risk during the unlearning process. Additionally, experimental results on real-world datasets demonstrate the game-theoretic unlearning algorithm's effectiveness and its ability to generate an unlearned model with performance similar to that of the retrained one while mitigating extra privacy leakage risks.
Submitted 6 November, 2024; originally announced November 2024.
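
One plausible reading of the unlearning module's loss, "composed of model distance and classification error", is a weighted sum of a parameter-space distance to a reference model and cross-entropy on retained data. The weighting and the distance metric below are assumptions of this sketch, not the paper's formulation.

    # Hedged sketch of an unlearning loss: parameter distance to a reference
    # model plus classification error on retained data. Weighting is invented.
    import torch
    import torch.nn.functional as F

    def unlearning_loss(model, reference, x_retain, y_retain, lam=0.5):
        # squared L2 distance between the two models' parameter vectors
        dist = sum((p - q).pow(2).sum()
                   for p, q in zip(model.parameters(), reference.parameters()))
        err = F.cross_entropy(model(x_retain), y_retain)
        return lam * dist + (1 - lam) * err

    net, ref = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
    loss = unlearning_loss(net, ref, torch.randn(8, 4), torch.randint(0, 2, (8,)))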

arXiv:2411.03844 [pdf, other] cs.CR
Attribute-Based Encryption With Payable Outsourced Decryption Using Blockchain and Responsive Zero Knowledge Proof
Authors: Dongliang Cai, Borui Chen, Liang Zhang, Kexin Li, Haibin Kan
Abstract: Attribute-Based Encryption (ABE) is a promising solution for access control in cloud services. However, the heavy decryption overhead hinders its widespread adoption. A general approach to address this issue is to outsource decryption to a decryption cloud service (DCS). Existing schemes have utilized various methods to enable users to verify outsourced results; however, they lack an effective mechanism to achieve exemptibility, which enables an honest DCS to escape from wrong claims. It is also impractical to assume that the DCS will provide free services. In this paper, we propose a blockchain-based payable outsourced decryption ABE scheme that achieves both verifiability and exemptibility without adding redundant information to the ABE ciphertext.
We use zero-knowledge proofs to verify outsourced results on the blockchain and introduce an optional single-round challenge game under an optimistic assumption to address the high cost of proof generation. Moreover, our system achieves fairness and decentralized outsourcing to protect the interests of all parties. Finally, we implement and evaluate our scheme on Ethereum to demonstrate its feasibility and efficiency: for attribute counts from 5 to 60, its gas usage is 11$\times$ to 140$\times$ lower than the scheme of Ge et al. (TDSC'23) in the happy case and 4$\times$ to 55$\times$ lower in the challenge case.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 12 pages, 5 figures

arXiv:2411.03752 [pdf, other] cs.LG cs.CR cs.CV
Deferred Poisoning: Making the Model More Vulnerable via Hessian Singularization
Authors: Yuhao He, Jinyu Tian, Xianwei Zheng, Li Dong, Yuanman Li, Leo Yu Zhang, Jiantao Zhou
Abstract: Recent studies have shown that deep learning models are very vulnerable to poisoning attacks. Many defense methods have been proposed to address this issue. However, traditional poisoning attacks are not as threatening as commonly believed. This is because they often cause differences in how the model performs on the training set compared to the validation set.
Such inconsistency can alert defenders that their data has been poisoned, allowing them to take the necessary defensive actions. In this paper, we introduce a more threatening type of poisoning attack called the Deferred Poisoning Attack. This new attack allows the model to function normally during the training and validation phases but makes it very sensitive to evasion attacks or even natural noise. We achieve this by ensuring the poisoned model's loss function has a value similar to that of a normally trained model at each input sample, but with a large local curvature. A similar model loss ensures that there is no obvious inconsistency between the training and validation accuracy, demonstrating high stealthiness. On the other hand, the large curvature implies that a small perturbation may cause a significant increase in model loss, leading to substantial performance degradation, which reflects worse robustness. We fulfill this purpose by making the model have singular Hessian information at the optimal point via our proposed Singularization Regularization term. We have conducted both theoretical and empirical analyses of the proposed method and validated its effectiveness through experiments on image classification tasks. Furthermore, we have confirmed the hazards of this form of poisoning attack under more general scenarios using natural noise, offering a new perspective for research in the field of security.
Submitted 6 November, 2024; originally announced November 2024.
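
The stated recipe, matching a clean model's per-sample loss while inflating local curvature, can be sketched with double backpropagation. The Hessian-vector-product curvature proxy below is an assumption of this sketch, not the paper's Singularization Regularization term.

    # Sketch: keep the loss close to a clean model's value while pushing up
    # local curvature at the input, approximated via a Hessian-vector product.
    import torch
    import torch.nn.functional as F

    def deferred_poison_objective(model, clean_loss_value, x, y, beta=1.0):
        x = x.clone().requires_grad_(True)
        loss = F.cross_entropy(model(x), y)
        (g,) = torch.autograd.grad(loss, x, create_graph=True)
        v = torch.randn_like(x)                      # random probe direction
        hvp = torch.autograd.grad((g * v).sum(), x, create_graph=True)[0]
        curvature = hvp.pow(2).sum()                 # crude local-curvature proxy
        match = (loss - clean_loss_value).pow(2)     # stay near the clean loss
        return match - beta * curvature              # minimize match, maximize curvature

    model = torch.nn.Linear(10, 3)
    obj = deferred_poison_objective(model, clean_loss_value=1.1,
                                    x=torch.randn(4, 10),
                                    y=torch.randint(0, 3, (4,)))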

arXiv:2411.03688 [pdf, other] cs.CV
Where Do We Stand with Implicit Neural Representations? A Technical and Performance Survey
Authors: Amer Essakine, Yanqi Cheng, Chun-Wun Cheng, Lipei Zhang, Zhongying Deng, Lei Zhu, Carola-Bibiane Schönlieb, Angelica I Aviles-Rivero
Abstract: Implicit Neural Representations (INRs) have emerged as a paradigm in knowledge representation, offering exceptional flexibility and performance across a diverse range of applications. INRs leverage multilayer perceptrons (MLPs) to model data as continuous implicit functions, providing critical advantages such as resolution independence, memory efficiency, and generalisation beyond discretised data structures. Their ability to solve complex inverse problems makes them particularly effective for tasks including audio reconstruction, image representation, 3D object reconstruction, and high-dimensional data synthesis. This survey provides a comprehensive review of state-of-the-art INR methods, introducing a clear taxonomy that categorises them into four key areas: activation functions, position encoding, combined strategies, and network structure optimisation. We rigorously analyse their critical properties, such as full differentiability, smoothness, compactness, and adaptability to varying resolutions, while also examining their strengths and limitations in addressing locality biases and capturing fine details. Our experimental comparison offers new insights into the trade-offs between different approaches, showcasing the capabilities and challenges of the latest INR techniques across various tasks. In addition to identifying areas where current methods excel, we highlight key limitations and potential avenues for improvement, such as developing more expressive activation functions, enhancing positional encoding mechanisms, and improving scalability for complex, high-dimensional data. This survey serves as a roadmap for researchers, offering practical guidance for future exploration in the field of INRs. We aim to foster new methodologies by outlining promising research directions for INRs and their applications.
Submitted 6 November, 2024; originally announced November 2024.
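
For readers new to INRs, the core object the survey studies is simply an MLP from continuous coordinates to signal values. A minimal sine-activated example in the spirit of SIREN, one family in the survey's taxonomy, fitting a toy 1-D signal:

    # Minimal coordinate MLP with sine activations fitting a 1-D toy signal.
    # Real INRs target images, audio, or 3-D fields; this only shows the idea.
    import torch
    import torch.nn as nn

    class SineLayer(nn.Module):
        def __init__(self, d_in, d_out, w0=30.0):
            super().__init__()
            self.lin, self.w0 = nn.Linear(d_in, d_out), w0
        def forward(self, x):
            return torch.sin(self.w0 * self.lin(x))

    inr = nn.Sequential(SineLayer(1, 64), SineLayer(64, 64), nn.Linear(64, 1))
    coords = torch.linspace(-1, 1, 256).unsqueeze(1)        # continuous inputs
    signal = torch.sin(8 * coords) * torch.exp(-coords ** 2)  # toy target
    opt = torch.optim.Adam(inr.parameters(), lr=1e-4)
    for _ in range(500):
        opt.zero_grad()
        loss = (inr(coords) - signal).pow(2).mean()
        loss.backward()
        opt.step()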

arXiv:2411.03360 [pdf, other] cs.LG stat.AP
Pedestrian Volume Prediction Using a Diffusion Convolutional Gated Recurrent Unit Model
Authors: Yiwei Dong, Tingjin Chu, Lele Zhang, Hadi Ghaderi, Hanfang Yang
Abstract: Effective models for analysing and predicting pedestrian flow are important to ensure the safety of both pedestrians and other road users. These tools also play a key role in optimising infrastructure design and geometry and supporting the economic utility of interconnected communities. The implementation of city-wide automatic pedestrian counting systems provides researchers with invaluable data, enabling the development and training of deep learning applications that offer better insights into traffic and crowd flows. Benefiting from real-world data provided by the City of Melbourne pedestrian counting system, this study presents a pedestrian flow prediction model, named DCGRU-DTW, that extends the Diffusion Convolutional Gated Recurrent Unit (DCGRU) with dynamic time warping. The model captures the spatial dependencies of pedestrian flow through the diffusion process and the temporal dependencies through the Gated Recurrent Unit (GRU). Through extensive numerical experiments, we demonstrate that the proposed model outperforms the classic vector autoregressive model and the original DCGRU across multiple model accuracy metrics.
Submitted 4 November, 2024; originally announced November 2024.
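
The diffusion convolution at the heart of DCGRU propagates node features over K random-walk steps in both graph directions and linearly mixes the diffused copies; DCGRU then builds GRU-style gates from such convolutions. A small sketch with illustrative shapes and randomly initialized weights:

    # Sketch of a bidirectional diffusion convolution over a sensor graph.
    # Weights and shapes are illustrative only.
    import numpy as np

    def diffusion_conv(X, A, K=2, rng=np.random.default_rng(0)):
        # X: (num_nodes, d_in) features; A: (num_nodes, num_nodes) adjacency
        P_fwd = A / A.sum(axis=1, keepdims=True)      # forward random walk
        P_bwd = A.T / A.T.sum(axis=1, keepdims=True)  # reverse random walk
        out, d_in = 0.0, X.shape[1]
        for P in (P_fwd, P_bwd):
            H = X
            for _ in range(K):
                H = P @ H                             # one diffusion step
                out = out + H @ rng.standard_normal((d_in, d_in)) * 0.1
        return out

    A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)
    X = np.ones((3, 4))
    print(diffusion_conv(X, A).shape)   # (3, 4)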

arXiv:2411.03027 [pdf, other] cs.AI cs.NE
Adaptive Genetic Selection based Pinning Control with Asymmetric Coupling for Multi-Network Heterogeneous Vehicular Systems
Authors: Weian Guo, Ruizhi Sha, Li Li, Lun Zhang, Dongyang Li
Abstract: To alleviate computational load on RSUs and cloud platforms, reduce communication bandwidth requirements, and provide a more stable vehicular network service, this paper proposes an optimized pinning control approach for heterogeneous multi-network vehicular ad-hoc networks (VANETs). In such networks, vehicles participate in multiple task-specific networks with asymmetric coupling and dynamic topologies.
We first establish a rigorous theoretical foundation by proving the stability of pinning control strategies under both single- and multi-network conditions, deriving sufficient stability conditions using Lyapunov theory and linear matrix inequalities (LMIs). Building on this theoretical groundwork, we propose an adaptive genetic algorithm tailored to select optimal pinning nodes, effectively balancing LMI constraints while prioritizing overlapping nodes to enhance control efficiency. Extensive simulations across various network scales demonstrate that our approach achieves rapid consensus with a reduced number of control nodes, particularly when leveraging network overlaps. This work provides a comprehensive solution for efficient control node selection in complex vehicular networks, offering practical implications for deploying large-scale intelligent transportation systems.
Submitted 5 November, 2024; originally announced November 2024.
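
A stripped-down version of genetic pinning-node selection follows. The fitness used here, the smallest eigenvalue of L + diag(pins), is a common pinning-controllability surrogate and stands in for the paper's LMI-based stability conditions and overlap prioritization, which this sketch does not model.

    # Toy genetic search over which k nodes to pin in a random topology.
    import numpy as np

    rng = np.random.default_rng(1)
    n, k, pop, gens = 12, 3, 30, 60
    A = (rng.random((n, n)) < 0.3).astype(float)
    A = np.triu(A, 1); A = A + A.T                       # undirected topology
    L = np.diag(A.sum(1)) - A                            # graph Laplacian

    def fitness(pins):
        # smallest eigenvalue of L + diag(pins): a controllability surrogate
        return np.linalg.eigvalsh(L + np.diag(pins))[0]

    popu = np.array([rng.permutation((np.arange(n) < k).astype(float))
                     for _ in range(pop)])
    for _ in range(gens):
        scores = np.array([fitness(p) for p in popu])
        parents = popu[np.argsort(scores)[-pop // 2:]]   # keep the fitter half
        children = parents.copy()
        for c in children:                               # mutation: swap one pin
            on, off = np.flatnonzero(c == 1), np.flatnonzero(c == 0)
            c[rng.choice(on)], c[rng.choice(off)] = 0, 1
        popu = np.vstack([parents, children])
    best = popu[np.argmax([fitness(p) for p in popu])]
    print("pinned nodes:", np.flatnonzero(best))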

arXiv:2411.02809 [pdf, other] cs.LG
Query-Efficient Adversarial Attack Against Vertical Federated Graph Learning
Authors: Jinyin Chen, Wenbo Mu, Luxin Zhang, Guohan Huang, Haibin Zheng, Yao Cheng
Abstract: Graph neural networks (GNNs) have captured wide attention due to their capability of graph representation learning for graph-structured data. However, distributed data silos limit the performance of GNNs. Vertical federated learning (VFL), an emerging technique for processing distributed data, makes it possible for GNNs to handle distributed graph-structured data. Despite the prosperous development of vertical federated graph learning (VFGL), the robustness of VFGL against adversarial attacks has not yet been explored. Although numerous adversarial attacks against centralized GNNs have been proposed, their attack performance is challenged in the VFGL scenario. To the best of our knowledge, this is the first work to explore adversarial attacks against VFGL. We propose a query-efficient hybrid adversarial attack framework, denoted NA2 (short for Neuron-based Adversarial Attack), that significantly improves centralized adversarial attacks against VFGL. Specifically, a malicious client manipulates its local training data to improve its contribution in a stealthy fashion. Then a shadow model is established based on the manipulated data to simulate the behavior of the server model in VFGL. As a result, the shadow model can improve the attack success rate of various centralized attacks with a few queries. Extensive experiments on five real-world benchmarks demonstrate that NA2 improves the performance of centralized adversarial attacks against VFGL, achieving state-of-the-art performance even under potential adaptive defense where the defender knows the attack method. Additionally, we provide interpretable experiments on the effectiveness of NA2 via sensitive neuron identification and t-SNE visualization.
Submitted 4 November, 2024; originally announced November 2024.

arXiv:2411.02793 [pdf, other] cs.CL cs.CV
Toward Robust Incomplete Multimodal Sentiment Analysis via Hierarchical Representation Learning
Authors: Mingcheng Li, Dingkang Yang, Yang Liu, Shunli Wang, Jiawei Chen, Shuaibing Wang, Jinjie Wei, Yue Jiang, Qingyao Xu, Xiaolu Hou, Mingyang Sun, Ziyun Qian, Dongliang Kou, Lihua Zhang
Abstract: Multimodal Sentiment Analysis (MSA) is an important research area that aims to understand and recognize human sentiment through multiple modalities. The complementary information provided by multimodal fusion promotes better sentiment analysis compared to utilizing only a single modality. Nevertheless, in real-world applications, many unavoidable factors may lead to situations of uncertain modality missing, thus hindering the effectiveness of multimodal modeling and degrading the model's performance. To this end, we propose a Hierarchical Representation Learning Framework (HRLF) for the MSA task under uncertain missing modalities.
Specifically, we propose a fine-grained representation factorization module that sufficiently extracts valuable sentiment information by factorizing each modality into sentiment-relevant and modality-specific representations through crossmodal translation and sentiment semantic reconstruction. Moreover, a hierarchical mutual information maximization mechanism is introduced to incrementally maximize the mutual information between multi-scale representations to align and reconstruct the high-level semantics in the representations. Finally, we propose a hierarchical adversarial learning mechanism that further aligns and adapts the latent distribution of sentiment-relevant representations to produce robust joint multimodal representations. Comprehensive experiments on three datasets demonstrate that HRLF significantly improves MSA performance under uncertain modality-missing cases.
Submitted 4 November, 2024; originally announced November 2024.
Comments: Accepted by NeurIPS 2024
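
Mutual information maximization between representations is often implemented with an InfoNCE-style contrastive bound; whether HRLF uses InfoNCE specifically is an assumption of this sketch, which treats two scales of the same sample as a positive pair.

    # Sketch: InfoNCE bound between two hierarchy levels of the same samples.
    import torch
    import torch.nn.functional as F

    def info_nce(z_low, z_high, temperature=0.1):
        # z_low, z_high: (batch, dim) representations at two hierarchy levels
        z_low, z_high = F.normalize(z_low, dim=1), F.normalize(z_high, dim=1)
        logits = z_low @ z_high.t() / temperature    # pairwise similarities
        labels = torch.arange(z_low.size(0))         # positives on the diagonal
        return F.cross_entropy(logits, labels)

    loss = info_nce(torch.randn(8, 32), torch.randn(8, 32))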

arXiv:2411.02603 [pdf, other] cs.CL cs.AI stat.ML
FactTest: Factuality Testing in Large Language Models with Finite-Sample and Distribution-Free Guarantees
Authors: Fan Nie, Xiaotian Hou, Shuhang Lin, James Zou, Huaxiu Yao, Linjun Zhang
Abstract: The propensity of Large Language Models (LLMs) to generate hallucinations and non-factual content undermines their reliability in high-stakes domains, where rigorous control over Type I errors (the conditional probability of incorrectly classifying hallucinations as truthful content) is essential. Despite its importance, formal verification of LLM factuality with such guarantees remains largely unexplored. In this paper, we introduce FactTest, a novel framework that statistically assesses whether an LLM can confidently provide correct answers to given questions with high-probability correctness guarantees. We formulate factuality testing as a hypothesis testing problem to enforce an upper bound on Type I errors at user-specified significance levels. Notably, we prove that our framework also ensures strong Type II error control under mild conditions and can be extended to maintain its effectiveness when covariate shifts exist. Our approach is distribution-free and works for any number of human-annotated samples. It is model-agnostic and applies to any black-box or white-box LM. Extensive experiments on question-answering (QA) and multiple-choice benchmarks demonstrate that FactTest effectively detects hallucinations and improves the model's ability to abstain from answering unknown questions, leading to an over 40% accuracy improvement.
Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
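
The distribution-free, finite-sample flavor of this guarantee resembles conformal calibration: pick the answering threshold from a finite-sample-adjusted quantile of confidence scores on calibration questions the model answered incorrectly, so wrong answers are certified with probability at most alpha. FactTest's actual test statistic and procedure are in the paper; this sketch only mirrors the idea.

    # Conformal-style sketch of Type I error control for answer/abstain decisions.
    import numpy as np

    def calibrate_threshold(wrong_scores, alpha=0.05):
        # wrong_scores: confidences on calibration answers known to be wrong
        n = len(wrong_scores)
        k = int(np.ceil((n + 1) * (1 - alpha)))      # finite-sample adjustment
        return np.sort(wrong_scores)[min(k, n) - 1]

    rng = np.random.default_rng(0)
    tau = calibrate_threshold(rng.beta(2, 5, size=500))

    def answer_or_abstain(confidence):               # test-time decision rule
        return "answer" if confidence > tau else "abstain"

    print(tau, answer_or_abstain(0.9))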