Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 778 results for author: <span class="mathjax">Yang, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19411">arXiv:2502.19411</a> <span> [<a href="https://arxiv.org/pdf/2502.19411">pdf</a>, <a href="https://arxiv.org/format/2502.19411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Code to Think, Think to Code: A Survey on Code-Enhanced Reasoning and Reasoning-Driven Code Intelligence in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dayu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Daoan Zhang</a>, <a href="/search/cs?searchtype=author&query=Simoulin%2C+A">Antoine Simoulin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuwei Cao</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+Z">Zhaopu Teng</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+X">Xin Qian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Grey Yang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&query=McAuley%2C+J">Julian McAuley</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19411v1-abstract-short" style="display: inline;"> In large language models (LLMs), code and reasoning reinforce each other: code offers an abstract, modular, and logic-driven structure that supports reasoning, while reasoning translates high-level goals into smaller, executable steps that drive more advanced code intelligence. In this study, we examine how code serves as a structured medium for enhancing reasoning: it provides verifiable executio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19411v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19411v1-abstract-full" style="display: none;"> In large language models (LLMs), code and reasoning reinforce each other: code offers an abstract, modular, and logic-driven structure that supports reasoning, while reasoning translates high-level goals into smaller, executable steps that drive more advanced code intelligence. In this study, we examine how code serves as a structured medium for enhancing reasoning: it provides verifiable execution paths, enforces logical decomposition, and enables runtime validation. We also explore how improvements in reasoning have transformed code intelligence from basic completion to advanced capabilities, enabling models to address complex software engineering tasks through planning and debugging. Finally, we identify key challenges and propose future research directions to strengthen this synergy, ultimately improving LLM's performance in both areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19411v1-abstract-full').style.display = 'none'; document.getElementById('2502.19411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Repo: https://github.com/dayuyang1999/Awesome-Code-Reasoning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13061">arXiv:2502.13061</a> <span> [<a href="https://arxiv.org/pdf/2502.13061">pdf</a>, <a href="https://arxiv.org/format/2502.13061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improved Fine-Tuning of Large Multimodal Models for Hateful Meme Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jingbiao Mei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinghong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guangyu Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weizhe Lin</a>, <a href="/search/cs?searchtype=author&query=Byrne%2C+B">Bill Byrne</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13061v1-abstract-short" style="display: inline;"> Hateful memes have become a significant concern on the Internet, necessitating robust automated detection systems. While large multimodal models have shown strong generalization across various tasks, they exhibit poor generalization to hateful meme detection due to the dynamic nature of memes tied to emerging social trends and breaking news. Recent work further highlights the limitations of conven… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13061v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13061v1-abstract-full" style="display: none;"> Hateful memes have become a significant concern on the Internet, necessitating robust automated detection systems. While large multimodal models have shown strong generalization across various tasks, they exhibit poor generalization to hateful meme detection due to the dynamic nature of memes tied to emerging social trends and breaking news. Recent work further highlights the limitations of conventional supervised fine-tuning for large multimodal models in this context. To address these challenges, we propose Large Multimodal Model Retrieval-Guided Contrastive Learning (LMM-RGCL), a novel two-stage fine-tuning framework designed to improve both in-domain accuracy and cross-domain generalization. Experimental results on six widely used meme classification datasets demonstrate that LMM-RGCL achieves state-of-the-art performance, outperforming agent-based systems such as VPD-PALI-X-55B. 
   Furthermore, our method effectively generalizes to out-of-domain memes under low-resource settings, surpassing models like GPT-4o.
   Submitted 18 February, 2025; originally announced February 2025.
   Comments: Preprint. Under Review

3. arXiv:2502.12449 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV)
   YUNet: Improved YOLOv11 Network for Skyline Detection
   Authors: Gang Yang, Miao Wang, Quan Zhou, Jiangchuan Li
   Abstract: Skyline detection plays an important role in geolocalization, flight control, visual navigation, port security, etc. The appearance of the sky and non-sky areas is variable because of different weather or illumination environments, which brings challenges to skyline detection. In this research, we propose the YUNet algorithm, which improves the YOLOv11 architecture to segment the sky region and extract the skyline in complicated and variable circumstances. To improve the ability of multi-scale and large-range contextual feature fusion, the YOLOv11 architecture is extended into a UNet-like architecture consisting of an encoder, neck and decoder submodule. The encoder extracts the multi-scale features from the given images. The neck fuses these multi-scale features. The decoder applies the fused features to complete the prediction rebuilding.
   To validate the proposed approach, YUNet was tested on the Skyfinder and CH1 datasets for segmentation and skyline detection respectively. Our tests show that the IoU of YUNet segmentation reaches 0.9858, and the average error of YUNet skyline detection is just 1.36 pixels. The implementation is published at https://github.com/kuazhangxiaoai/SkylineDet-YOLOv11Seg.git.
   Submitted 17 February, 2025; originally announced February 2025.

4. arXiv:2502.11909 [pdf, other]
   Subjects: Machine Learning (stat.ML); Machine Learning (cs.LG)
   Neural Guided Diffusion Bridges
   Authors: Gefan Yang, Frank van der Meulen, Stefan Sommer
   Abstract: We propose a novel method for simulating conditioned diffusion processes (diffusion bridges) in Euclidean spaces. By training a neural network to approximate bridge dynamics, our approach eliminates the need for computationally intensive Markov Chain Monte Carlo (MCMC) methods or reverse-process modeling. Compared to existing methods, it offers greater robustness across various diffusion specifications and conditioning scenarios. This applies in particular to rare events and multimodal distributions, which pose challenges for score-learning- and MCMC-based approaches. We propose a flexible variational family for approximating the diffusion bridge path measure which is partially specified by a neural network.
   Once trained, it enables efficient independent sampling at a cost comparable to sampling the unconditioned (forward) process.
   Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.

5. arXiv:2502.11358 [pdf, other]
   Subjects: Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR)
   Mimicking the Familiar: Dynamic Command Generation for Information Theft Attacks in LLM Tool-Learning System
   Authors: Ziyou Jiang, Mingyang Li, Guowei Yang, Junjie Wang, Yuekai Huang, Zhiyuan Chang, Qing Wang
   Abstract: Information theft attacks pose a significant risk to Large Language Model (LLM) tool-learning systems. Adversaries can inject malicious commands through compromised tools, manipulating LLMs to send sensitive information to these tools, which leads to potential privacy breaches. However, existing attack approaches are black-box oriented and rely on static commands that cannot adapt flexibly to changes in user queries and the invocation chain of tools. This makes malicious commands more likely to be detected by the LLM and leads to attack failure.
   In this paper, we propose AutoCMD, a dynamic attack command generation approach for information theft attacks in LLM tool-learning systems. Inspired by the concept of mimicking the familiar, AutoCMD is capable of inferring the information utilized by upstream tools in the toolchain through learning on open-source systems and reinforcement with target system examples, thereby generating more targeted commands for information theft. The evaluation results show that AutoCMD outperforms the baselines with +13.2% $ASR_{Theft}$, and can be generalized to new tool-learning systems to expose their information leakage risks. We also design four defense methods to effectively protect tool-learning systems from the attack.
   Submitted 16 February, 2025; originally announced February 2025.
   Comments: 15 pages, 11 figures

6. arXiv:2502.10248 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV); Computation and Language (cs.CL)
   Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model
   Authors: Guoqing Ma, Haoyang Huang, Kun Yan, Liangyu Chen, Nan Duan, Shengming Yin, Changyi Wan, Ranchen Ming, Xiaoniu Song, Xing Chen, Yu Zhou, Deshan Sun, Deyu Zhou, Jian Zhou, Kaijun Tan, Kang An, Mei Chen, Wei Ji, Qiling Wu, Wen Sun,
href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v3-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v3-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v3-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v3-abstract-full').style.display = 'none'; document.getElementById('2502.10248v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09563">arXiv:2502.09563</a> <span> [<a href="https://arxiv.org/pdf/2502.09563">pdf</a>, <a href="https://arxiv.org/format/2502.09563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Self-Calibrating Gaussian Splatting for Large Field of View Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Youming Deng</a>, <a href="/search/cs?searchtype=author&query=Xian%2C+W">Wenqi Xian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guandao Yang</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&query=Marschner%2C+S">Steve Marschner</a>, <a href="/search/cs?searchtype=author&query=Debevec%2C+P">Paul Debevec</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09563v1-abstract-short" style="display: inline;"> In this paper, we present a self-calibrating framework that jointly optimizes camera parameters, lens distortion and 3D Gaussian representations, enabling accurate and efficient scene reconstruction. In particular, our technique enables high-quality scene reconstruction from Large field-of-view (FOV) imagery taken with wide-angle lenses, allowing the scene to be modeled from a smaller number of im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09563v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09563v1-abstract-full" style="display: none;"> In this paper, we present a self-calibrating framework that jointly optimizes camera parameters, lens distortion and 3D Gaussian representations, enabling accurate and efficient scene reconstruction. In particular, our technique enables high-quality scene reconstruction from Large field-of-view (FOV) imagery taken with wide-angle lenses, allowing the scene to be modeled from a smaller number of images. Our approach introduces a novel method for modeling complex lens distortions using a hybrid network that combines invertible residual networks with explicit grids. This design effectively regularizes the optimization process, achieving greater accuracy than conventional camera models. Additionally, we propose a cubemap-based resampling strategy to support large FOV images without sacrificing resolution or introducing distortion artifacts. Our method is compatible with the fast rasterization of Gaussian Splatting, adaptable to a wide variety of camera lens distortion, and demonstrates state-of-the-art performance on both synthetic and real-world datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09563v1-abstract-full').style.display = 'none'; document.getElementById('2502.09563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://denghilbert.github.io/self-cali/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07856">arXiv:2502.07856</a> <span> [<a href="https://arxiv.org/pdf/2502.07856">pdf</a>, <a href="https://arxiv.org/format/2502.07856">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MRS: A Fast Sampler for Mean Reverting Diffusion based on ODE and SDE Solvers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+A">Ao Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+W">Wei Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hongbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Le Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minfeng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07856v3-abstract-short" style="display: inline;"> In applications of diffusion models, controllable generation is of practical significance, but is also challenging. Current methods for controllable generation primarily focus on modifying the score function of diffusion models, while Mean Reverting (MR) Diffusion directly modifies the structure of the stochastic differential equation (SDE), making the incorporation of image conditions simpler and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07856v3-abstract-full').style.display = 'inline'; document.getElementById('2502.07856v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07856v3-abstract-full" style="display: none;"> In applications of diffusion models, controllable generation is of practical significance, but is also challenging. Current methods for controllable generation primarily focus on modifying the score function of diffusion models, while Mean Reverting (MR) Diffusion directly modifies the structure of the stochastic differential equation (SDE), making the incorporation of image conditions simpler and more natural. However, current training-free fast samplers are not directly applicable to MR Diffusion. And thus MR Diffusion requires hundreds of NFEs (number of function evaluations) to obtain high-quality samples. 
   In this paper, we propose a new algorithm named MRS (MR Sampler) to reduce the sampling NFEs of MR Diffusion. We solve the reverse-time SDE and the probability flow ordinary differential equation (PF-ODE) associated with MR Diffusion, and derive semi-analytical solutions. The solutions consist of an analytical function and an integral parameterized by a neural network. Based on this solution, we can generate high-quality samples in fewer steps. Our approach does not require training and supports all mainstream parameterizations, including noise prediction, data prediction and velocity prediction. Extensive experiments demonstrate that MR Sampler maintains high sampling quality with a speedup of 10 to 20 times across ten different image restoration tasks. Our algorithm accelerates the sampling procedure of MR Diffusion, making it more practical in controllable generation.
   Submitted 19 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
   Comments: Accepted by ICLR 2025

9. arXiv:2502.05282 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
   Homeomorphism Prior for False Positive and Negative Problem in Medical Image Dense Contrastive Representation Learning
   Authors: Yuting He, Boyu Wang, Rongjun Ge, Yang Chen, Guanyu Yang, Shuo Li
   Abstract: Dense contrastive representation learning (DCRL) has greatly improved the learning efficiency for image-dense prediction tasks, showing its great potential to reduce the large costs of medical image collection and dense annotation.
   However, the properties of medical images make correspondence discovery unreliable, bringing an open problem of large-scale false positive and negative (FP&N) pairs in DCRL. In this paper, we propose GEoMetric vIsual deNse sImilarity (GEMINI) learning, which embeds the homeomorphism prior into DCRL and enables reliable correspondence discovery for effective dense contrast. We propose a deformable homeomorphism learning (DHL) which models the homeomorphism of medical images and learns to estimate a deformable mapping to predict the pixels' correspondence under topological preservation. It effectively reduces the searching space of pairing and drives an implicit and soft learning of negative pairs via a gradient. We also propose a geometric semantic similarity (GSS) which extracts semantic information in features to measure the alignment degree for the correspondence learning. It promotes the learning efficiency and performance of deformation, constructing positive pairs reliably. We implement two practical variants on two typical representation learning tasks in our experiments. Our promising results on seven datasets, which outperform the existing methods, show our great superiority. We will release our code on a companion link: https://github.com/YutingHe-list/GEMINI.
   Submitted 7 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by T-PAMI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05001">arXiv:2502.05001</a> <span> [<a href="https://arxiv.org/pdf/2502.05001">pdf</a>, <a href="https://arxiv.org/format/2502.05001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A New Paradigm in Tuning Learned Indexes: A Reinforcement Learning Enhanced Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Taiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Liang Liang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Heinis%2C+T">Thomas Heinis</a>, <a href="/search/cs?searchtype=author&query=Yoneki%2C+E">Eiko Yoneki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05001v2-abstract-short" style="display: inline;"> Learned Index Structures (LIS) have significantly advanced data management by leveraging machine learning models to optimize data indexing. However, designing these structures often involves critical trade-offs, making it challenging for both designers and end-users to find an optimal balance tailored to specific workloads and scenarios. While some indexes offer adjustable parameters that demand i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05001v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05001v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05001v2-abstract-full" style="display: none;"> Learned Index Structures (LIS) have significantly advanced data management by leveraging machine learning models to optimize data indexing. However, designing these structures often involves critical trade-offs, making it challenging for both designers and end-users to find an optimal balance tailored to specific workloads and scenarios. While some indexes offer adjustable parameters that demand intensive manual tuning, others rely on fixed configurations based on heuristic auto-tuners or expert knowledge, which may not consistently deliver optimal performance. This paper introduces LITune, a novel framework for end-to-end automatic tuning of Learned Index Structures. LITune employs an adaptive training pipeline equipped with a tailor-made Deep Reinforcement Learning (DRL) approach to ensure stable and efficient tuning. To accommodate long-term dynamics arising from online tuning, we further enhance LITune with an on-the-fly updating mechanism termed the O2 system. 
   These innovations allow LITune to effectively capture state transitions in online tuning scenarios and dynamically adjust to changing data distributions and workloads, marking a significant improvement over other tuning methods. Our experimental results demonstrate that LITune achieves up to a 98% reduction in runtime and a 17-fold increase in throughput compared to default parameter settings given a selected Learned Index instance. These findings highlight LITune's effectiveness and its potential to facilitate broader adoption of LIS in real-world applications.
   Submitted 18 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
   Comments: 15 pages

11. arXiv:2502.02624 [pdf, other]
   Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)
   Muographic Image Upsampling with Machine Learning for Built Infrastructure Applications
   Authors: William O'Donnell, David Mahon, Guangliang Yang, Simon Gardner
   Abstract: The civil engineering industry faces a critical need for innovative non-destructive evaluation methods, particularly for ageing critical infrastructure, such as bridges, where current techniques fall short. Muography, a non-invasive imaging technique, constructs three-dimensional density maps by detecting interactions of naturally occurring cosmic-ray muons within the scanned volume.
   Cosmic-ray muons provide deep penetration and inherent safety due to their high momenta and natural source. However, the technology's reliance on this source results in constrained muon flux, leading to prolonged acquisition times, noisy reconstructions and image interpretation challenges. To address these limitations, we developed a two-model deep learning approach. First, we employed a conditional Wasserstein generative adversarial network with gradient penalty (cWGAN-GP) to perform predictive upsampling of undersampled muography images. Using the structural similarity index measure (SSIM), 1-day sampled images matched the perceptual qualities of a 21-day image, while the peak signal-to-noise ratio (PSNR) indicated noise improvement equivalent to 31 days of sampling. A second cWGAN-GP model, trained for semantic segmentation, quantitatively assessed the upsampling model's impact on concrete sample features. This model achieved segmentation of rebar grids and tendon ducts, with Dice-Sørensen accuracy coefficients of 0.8174 and 0.8663. Notably, it could mitigate or remove z-plane smearing artifacts caused by muography's inverse imaging problem. Both models were trained on a comprehensive Geant4 Monte-Carlo simulation dataset reflecting realistic civil infrastructure scenarios. Our results demonstrate significant improvements in acquisition speed and image quality, marking a substantial step toward making muography more practical for reinforced concrete infrastructure monitoring applications.
   Submitted 4 February, 2025; originally announced February 2025.
arXiv:2501.18824 (https://arxiv.org/abs/2501.18824) [pdf, other]
Categories: cs.CL, cs.LG
Title: Memory-Efficient Fine-Tuning of Transformers via Token Selection
Authors: Antoine Simoulin, Namyong Park, Xiaoyi Liu, Grey Yang
Abstract: Fine-tuning provides an effective means to specialize pre-trained models for various downstream tasks. However, fine-tuning often incurs high memory overhead, especially for large transformer-based models, such as LLMs. While existing methods may reduce certain parts of the memory required for fine-tuning, they still require caching all intermediate activations computed in the forward pass to update weights during the backward pass. In this work, we develop TokenTune, a method to reduce memory usage, specifically the memory to store intermediate activations, in the fine-tuning of transformer-based models. During the backward pass, TokenTune approximates the gradient computation by backpropagating through just a subset of input tokens. Thus, with TokenTune, only a subset of intermediate activations are cached during the forward pass. Also, TokenTune can be easily combined with existing methods like LoRA, further reducing the memory cost. We evaluate our approach on pre-trained transformer models with up to billions of parameters, considering the performance on multiple downstream tasks such as text classification and question answering in a few-shot learning setup. Overall, TokenTune achieves performance on par with full fine-tuning or representative memory-efficient fine-tuning methods, while greatly reducing the memory footprint, especially when combined with other methods with complementary memory reduction mechanisms.
We hope that our approach will facilitate the fine-tuning of large transformers, in specializing them for specific domains or co-training them with other neural components from a larger system. Our code is available at https://github.com/facebookresearch/tokentune.
Submitted 30 January, 2025; originally announced January 2025.
Comments: EMNLP 2024
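A rough sketch of the core idea of backpropagating through only a subset of tokens follows; the random selection rule, function name, and insertion point are illustrative assumptions, not the released TokenTune implementation (whose memory bookkeeping is more involved).

```python
import torch

def select_token_subset(hidden, k):
    """hidden: (batch, seq_len, dim). Keep the computation graph only for k
    randomly chosen positions per sequence; the remaining positions are detached,
    so gradients flow through just the selected tokens."""
    batch, seq_len, _ = hidden.shape
    idx = torch.rand(batch, seq_len, device=hidden.device).argsort(dim=1)[:, :k]
    mask = torch.zeros(batch, seq_len, 1, dtype=torch.bool, device=hidden.device)
    mask.scatter_(1, idx.unsqueeze(-1), True)
    return torch.where(mask, hidden, hidden.detach())

# Hypothetical use inside a transformer forward pass:
# hidden = select_token_subset(hidden, k=64)
# hidden = transformer_block(hidden)
```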
arXiv:2501.17906 (https://arxiv.org/abs/2501.17906) [pdf, other]
Categories: cs.CV, eess.IV
Title: Unsupervised Patch-GAN with Targeted Patch Ranking for Fine-Grained Novelty Detection in Medical Imaging
Authors: Jingkun Chen, Guang Yang, Xiao Zhang, Jingchao Peng, Tianlu Zhang, Jianguo Zhang, Jungong Han, Vicente Grau
Abstract: Detecting novel anomalies in medical imaging is challenging due to the limited availability of labeled data for rare abnormalities, which often display high variability and subtlety. This challenge is further compounded when small abnormal regions are embedded within larger normal areas, as whole-image predictions frequently overlook these subtle deviations. To address these issues, we propose an unsupervised Patch-GAN framework designed to detect and localize anomalies by capturing both local detail and global structure. Our framework first reconstructs masked images to learn fine-grained, normal-specific features, allowing for enhanced sensitivity to minor deviations from normality. By dividing these reconstructed images into patches and assessing the authenticity of each patch, our approach identifies anomalies at a more granular level, overcoming the limitations of whole-image evaluation. Additionally, a patch-ranking mechanism prioritizes regions with higher abnormal scores, reinforcing the alignment between local patch discrepancies and the global image context. Experimental results on the ISIC 2016 skin lesion and BraTS 2019 brain tumor datasets validate our framework's effectiveness, achieving AUCs of 95.79% and 96.05%, respectively, and outperforming three state-of-the-art baselines.
Submitted 29 January, 2025; originally announced January 2025.
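One simple way to realise patch-level scoring and ranking on a reconstruction-error map is sketched below; the patch size and the mean-error score are my own assumptions, not the paper's discriminator-based authenticity score.

```python
import torch
import torch.nn.functional as F

def rank_patches(image, reconstruction, patch_size=32):
    """Score non-overlapping patches by mean reconstruction error and return
    the flat patch scores plus indices sorted from most to least anomalous.
    image, reconstruction: (1, C, H, W) tensors."""
    error = (image - reconstruction).abs().mean(dim=1, keepdim=True)          # (1, 1, H, W)
    patch_scores = F.avg_pool2d(error, kernel_size=patch_size, stride=patch_size)
    flat = patch_scores.flatten()                                             # one score per patch
    order = torch.argsort(flat, descending=True)
    return flat, order
```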
arXiv:2501.17711 (https://arxiv.org/abs/2501.17711) [pdf, other]
Categories: cs.LG
Title: STGCN-LSTM for Olympic Medal Prediction: Dynamic Power Modeling and Causal Policy Optimization
Authors: Yiquan Wang, Jiaying Wang, Tin-Yeh Huang, Jingyi Yang, Guanjie Yang, Zihao Xu
Abstract: This paper proposes a novel hybrid model, STGCN-LSTM, to forecast Olympic medal distributions by integrating the spatio-temporal relationships among countries and the long-term dependencies of national performance. The Spatial-Temporal Graph Convolution Network (STGCN) captures geographic and interactive factors, such as coaching exchange and socio-economic links, while the Long Short-Term Memory (LSTM) module models historical trends in medal counts, economic data, and demographics. To address zero-inflated outputs (i.e., the disparity between countries that consistently win medals and those that have never won any), a Zero-Inflated Compound Poisson (ZICP) framework is incorporated to separate random zeros from structural zeros, providing a clearer view of potential breakthrough performances. Validation includes historical backtracking, policy shock simulations, and causal inference checks, confirming the robustness of the proposed method. Results shed light on the influence of coaching mobility, event specialization, and strategic investment on medal forecasts, offering a data-driven foundation for optimizing sports policies and resource allocation in diverse Olympic contexts.
Submitted 20 February, 2025; v1 submitted 29 January, 2025; originally announced January 2025.
Comments: 18 pages, 7 figures
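The ZICP component separates structural zeros from random zeros. As a simplified illustration, the sketch below evaluates a plain zero-inflated Poisson log-probability rather than the compound-Poisson variant the paper uses; the parameter values are made up.

```python
import math

def zero_inflated_poisson_logpmf(y, lam, pi):
    """Log-probability of count y under a zero-inflated Poisson: with probability
    pi the outcome is a structural zero, otherwise y ~ Poisson(lam)."""
    if y == 0:
        return math.log(pi + (1.0 - pi) * math.exp(-lam))
    return math.log(1.0 - pi) + y * math.log(lam) - lam - math.lgamma(y + 1)

# A country with a small expected medal count but a high structural-zero probability:
print(zero_inflated_poisson_logpmf(0, lam=0.8, pi=0.6))
print(zero_inflated_poisson_logpmf(2, lam=0.8, pi=0.6))
```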
arXiv:2501.17547 (https://arxiv.org/abs/2501.17547) [pdf, other]
Categories: cs.CV
Title: Towards Training-Free Open-World Classification with 3D Generative Models
Authors: Xinzhe Xia, Weiguang Zhao, Yuyao Yan, Guanyu Yang, Rui Zhang, Kaizhu Huang, Xi Yang
Abstract: 3D open-world classification is a challenging yet essential task in dynamic and unstructured real-world scenarios, requiring both open-category and open-pose recognition. To address these challenges, recent wisdom often takes sophisticated 2D pre-trained models to provide enriched and stable representations. However, these methods largely rely on how 3D objects can be projected into 2D space, which is unfortunately not well solved, and thus significantly limits their performance. Unlike these present efforts, in this paper we make a pioneering exploration of 3D generative models for 3D open-world classification. Drawing on abundant prior knowledge from 3D generative models, we additionally craft a rotation-invariant feature extractor. This innovative synergy endows our pipeline with the advantages of being training-free, open-category, and pose-invariant, thus well suited to 3D open-world classification. Extensive experiments on benchmark datasets demonstrate the potential of generative models in 3D open-world classification, achieving state-of-the-art performance on ModelNet10 and McGill with 32.0% and 8.7% overall accuracy improvement, respectively.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17547v1-abstract-full').style.display = 'none'; document.getElementById('2501.17547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16778">arXiv:2501.16778</a> <span> [<a href="https://arxiv.org/pdf/2501.16778">pdf</a>, <a href="https://arxiv.org/format/2501.16778">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FlexMotion: Lightweight, Physics-Aware, and Controllable Human Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tashakori%2C+A">Arvin Tashakori</a>, <a href="/search/cs?searchtype=author&query=Tashakori%2C+A">Arash Tashakori</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Gongbo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z+J">Z. Jane Wang</a>, <a href="/search/cs?searchtype=author&query=Servati%2C+P">Peyman Servati</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16778v1-abstract-short" style="display: inline;"> Lightweight, controllable, and physically plausible human motion synthesis is crucial for animation, virtual reality, robotics, and human-computer interaction applications. Existing methods often compromise between computational efficiency, physical realism, or spatial controllability. We propose FlexMotion, a novel framework that leverages a computationally lightweight diffusion model operating i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16778v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16778v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16778v1-abstract-full" style="display: none;"> Lightweight, controllable, and physically plausible human motion synthesis is crucial for animation, virtual reality, robotics, and human-computer interaction applications. Existing methods often compromise between computational efficiency, physical realism, or spatial controllability. We propose FlexMotion, a novel framework that leverages a computationally lightweight diffusion model operating in the latent space, eliminating the need for physics simulators and enabling fast and efficient training. FlexMotion employs a multimodal pre-trained Transformer encoder-decoder, integrating joint locations, contact forces, joint actuations and muscle activations to ensure the physical plausibility of the generated motions. 
FlexMotion also introduces a plug-and-play module, which adds spatial controllability over a range of motion parameters (e.g., joint locations, joint actuations, contact forces, and muscle activations). Our framework achieves realistic motion generation with improved efficiency and control, setting a new benchmark for human motion synthesis. We evaluate FlexMotion on extended datasets and demonstrate its superior performance in terms of realism, physical plausibility, and controllability.
Submitted 28 January, 2025; originally announced January 2025.

arXiv:2501.14929 (https://arxiv.org/abs/2501.14929) [pdf, other]
Categories: cs.CV, cs.AI
Title: Motion-enhancement to Echocardiography Segmentation via Inserting a Temporal Attention Module: An Efficient, Adaptable, and Scalable Approach
Authors: Md. Kamrul Hasan, Guang Yang, Choon Hwai Yap
Abstract: Cardiac anatomy segmentation is essential for clinical assessment of cardiac function and disease diagnosis to inform treatment and intervention. In performing segmentation, deep learning (DL) algorithms improved accuracy significantly compared to traditional image processing approaches. More recently, studies showed that enhancing DL segmentation with motion information can further improve it.
A range of methods for injecting motion information has been proposed, but many of them increase the dimensionality of input images (which is computationally expensive) or have not used an optimal method to insert motion information, such as non-DL registration, non-attention-based networks or single-headed attention. Here, we present a computation-efficient alternative: a novel, scalable temporal attention module (TAM) that extracts temporal feature interactions multiple times through a multi-headed, KQV-projection cross-attention architecture. The module can be seamlessly integrated into a wide range of existing CNN- or Transformer-based networks, providing novel flexibility for inclusion in future implementations. Extensive evaluations on different cardiac datasets, including 2D echocardiography (CAMUS) and 3D echocardiography (MITEA), demonstrate the model's effectiveness when integrated into well-established backbone networks like UNet, FCN8s, UNetR, SwinUNetR, and the recent I2UNet. We further find that the optimized TAM-enhanced FCN8s network performs well compared to contemporary alternatives. Our results confirm TAM's robustness, scalability, and generalizability across diverse datasets and backbones.
Submitted 24 January, 2025; originally announced January 2025.
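The TAM is described as multi-headed KQV cross-attention over time; a generic sketch of such a module follows (shapes, the residual connection, and the normalisation choice are my assumptions, not the authors' architecture).

```python
import torch
import torch.nn as nn

class TemporalCrossAttention(nn.Module):
    """Minimal multi-head KQV cross-attention across time: tokens of the frame
    being segmented attend to tokens pooled from neighbouring frames."""
    def __init__(self, dim, num_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, target_feat, context_feats):
        # target_feat: (B, N, C) tokens of the current frame
        # context_feats: (B, T*N, C) tokens from the other frames
        attended, _ = self.attn(query=target_feat, key=context_feats, value=context_feats)
        return self.norm(target_feat + attended)      # residual connection

# module = TemporalCrossAttention(dim=64)
# out = module(torch.randn(2, 196, 64), torch.randn(2, 5 * 196, 64))
```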
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yew%2C+S+M+E">Samantha Min Er Yew</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+X">Xiaofeng Lei</a>, <a href="/search/cs?searchtype=author&query=Goh%2C+J+H+L">Jocelyn Hui Lin Goh</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yibing Chen</a>, <a href="/search/cs?searchtype=author&query=Srinivasan%2C+S">Sahana Srinivasan</a>, <a href="/search/cs?searchtype=author&query=Chee%2C+M">Miao-li Chee</a>, <a href="/search/cs?searchtype=author&query=Pushpanathan%2C+K">Krithi Pushpanathan</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+K">Ke Zou</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Q">Qingshan Hou</a>, <a href="/search/cs?searchtype=author&query=Da+Soh%2C+Z">Zhi Da Soh</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+C">Cancan Xue</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M+C+Y">Marco Chak Yan Yu</a>, <a href="/search/cs?searchtype=author&query=Sabanayagam%2C+C">Charumathi Sabanayagam</a>, <a href="/search/cs?searchtype=author&query=Tai%2C+E+S">E Shyong Tai</a>, <a href="/search/cs?searchtype=author&query=Sim%2C+X">Xueling Sim</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaxing Wang</a>, <a href="/search/cs?searchtype=author&query=Jonas%2C+J+B">Jost B. Jonas</a>, <a href="/search/cs?searchtype=author&query=Nangia%2C+V">Vinay Nangia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G+D">Gabriel Dawei Yang</a>, <a href="/search/cs?searchtype=author&query=Ran%2C+E+A">Emma Anran Ran</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+C+Y">Carol Yim-Lui Cheung</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yangqin Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Goh%2C+R+S+M">Rick Siow Mong Goh</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yukun Zhou</a> , et al. (4 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12016v1-abstract-short" style="display: inline;"> Background: RETFound, a self-supervised, retina-specific foundation model (FM), showed potential in downstream applications. However, its comparative performance with traditional deep learning (DL) models remains incompletely understood. This study aimed to evaluate RETFound against three ImageNet-pretrained supervised DL models (ResNet50, ViT-base, SwinV2) in detecting ocular and systemic disease… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12016v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12016v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12016v1-abstract-full" style="display: none;"> Background: RETFound, a self-supervised, retina-specific foundation model (FM), showed potential in downstream applications. However, its comparative performance with traditional deep learning (DL) models remains incompletely understood. This study aimed to evaluate RETFound against three ImageNet-pretrained supervised DL models (ResNet50, ViT-base, SwinV2) in detecting ocular and systemic diseases. 
Methods: We fine-tuned/trained RETFound and three DL models on full datasets, 50%, 20%, and fixed sample sizes (400, 200, and 100 images, with half comprising disease cases; for each DR severity class, 100 and 50 cases were used). Fine-tuned models were tested internally using the SEED (53,090 images) and APTOS-2019 (3,672 images) datasets and externally validated on population-based (BES, CIEMS, SP2, UKBB) and open-source datasets (ODIR-5k, PAPILA, GAMMA, IDRiD, MESSIDOR-2). Model performance was compared using the area under the receiver operating characteristic curve (AUC) and Z-tests with Bonferroni correction (P<0.05/3). Interpretation: Traditional DL models are mostly comparable to RETFound for ocular disease detection with large datasets. However, RETFound is superior in systemic disease detection with smaller datasets. These findings offer valuable insights into the respective merits and limitations of traditional models and FMs.
Submitted 21 January, 2025; originally announced January 2025.
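The comparison described above pairs AUCs with Bonferroni-corrected Z-tests. The sketch below shows one simple way to set up such a test, using bootstrap standard errors and ignoring the correlation between models scored on the same test set; it is my own simplification, not the study's statistical code, and the function names are placeholders.

```python
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import norm

def bootstrap_auc_se(y_true, y_score, n_boot=1000, seed=0):
    """Bootstrap standard error of the AUC. y_true, y_score: 1-D numpy arrays."""
    rng = np.random.default_rng(seed)
    n = len(y_true)
    aucs = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        if len(np.unique(y_true[idx])) < 2:      # need both classes in the resample
            continue
        aucs.append(roc_auc_score(y_true[idx], y_score[idx]))
    return np.std(aucs)

def compare_aucs(y_true, score_a, score_b, alpha=0.05 / 3):
    """Two-sided Z-test on the AUC difference at a Bonferroni-corrected level."""
    auc_a, auc_b = roc_auc_score(y_true, score_a), roc_auc_score(y_true, score_b)
    se = np.hypot(bootstrap_auc_se(y_true, score_a), bootstrap_auc_se(y_true, score_b))
    z = (auc_a - auc_b) / se
    p = 2 * (1 - norm.cdf(abs(z)))
    return auc_a, auc_b, p, p < alpha
```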
arXiv:2501.10920 (https://arxiv.org/abs/2501.10920) [pdf, other]
Categories: cs.LG, eess.SY
Title: Data Enrichment Opportunities for Distribution Grid Cable Networks using Variational Autoencoders
Authors: Konrad Sundsgaard, Kutay Bölat, Guangya Yang
Abstract: Electricity distribution cable networks suffer from incomplete and unbalanced data, hindering the effectiveness of machine learning models for predictive maintenance and reliability evaluation. Features such as the installation date of the cables are frequently missing. To address data scarcity, this study investigates the application of Variational Autoencoders (VAEs) for data enrichment, synthetic data generation, imbalanced data handling, and outlier detection. Based on a proof-of-concept case study for Denmark, targeting the imputation of missing age information in cable network asset registers, the analysis underlines the potential of generative models to support data-driven maintenance. However, the study also highlights several areas for improvement, including enhanced feature importance analysis, incorporating network characteristics and external features, and handling biases in missing data. Future initiatives should expand the application of VAEs by incorporating semi-supervised learning, advanced sampling techniques, and additional distribution grid elements, including low-voltage networks, into the analysis.
Submitted 18 January, 2025; originally announced January 2025.
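For context, a minimal VAE of the kind such a study might apply to tabular asset records is sketched below; the architecture, layer sizes, and loss weighting are placeholders rather than the paper's model.

```python
import torch
import torch.nn as nn

class TabularVAE(nn.Module):
    """Minimal VAE for tabular records: encode features, sample a latent code
    with the reparameterisation trick, decode a reconstruction."""
    def __init__(self, n_features, latent_dim=8, hidden=64):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_features, hidden), nn.ReLU())
        self.mu = nn.Linear(hidden, latent_dim)
        self.logvar = nn.Linear(hidden, latent_dim)
        self.decoder = nn.Sequential(nn.Linear(latent_dim, hidden), nn.ReLU(),
                                     nn.Linear(hidden, n_features))

    def forward(self, x):
        h = self.encoder(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)   # reparameterisation
        return self.decoder(z), mu, logvar

def vae_loss(x, x_hat, mu, logvar):
    recon = nn.functional.mse_loss(x_hat, x, reduction="sum")
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + kl
```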
arXiv:2501.10705 (https://arxiv.org/abs/2501.10705) [pdf, other]
Categories: cs.IT, eess.SP
Title: Secure Communication in Dynamic RDARS-Driven Systems
Authors: Ziqian Pei, Jintao Wang, Pingping Zhang, Zheng Shi, Guanghua Yang, Shaodan Ma
Abstract: In this letter, we investigate a dynamic reconfigurable distributed antenna and reflection surface (RDARS)-driven secure communication system, where the working mode of the RDARS can be flexibly configured. We aim to maximize the secrecy rate by jointly designing the active beamforming vectors, reflection coefficients, and the channel-aware mode selection matrix. To address the non-convex binary and cardinality constraints introduced by dynamic mode selection, we propose an efficient alternating optimization (AO) framework that employs penalty-based fractional programming (FP) and successive convex approximation (SCA) transformations. Simulation results demonstrate the potential of RDARS in enhancing the secrecy rate and show its superiority compared to existing reflection surface-based schemes.
Submitted 18 January, 2025; originally announced January 2025.
Comments: 5 pages, 5 figures
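The objective being maximized is the secrecy rate; the textbook definition it builds on can be computed as below, with generic SNR values standing in for the paper's beamforming-dependent quantities.

```python
import math

def secrecy_rate(snr_user, snr_eve):
    """Standard secrecy rate in bit/s/Hz: the positive part of the gap between
    the legitimate user's achievable rate and the eavesdropper's rate."""
    return max(0.0, math.log2(1.0 + snr_user) - math.log2(1.0 + snr_eve))

# Example: a 20 dB legitimate link against a 5 dB eavesdropper link.
print(secrecy_rate(10 ** (20 / 10), 10 ** (5 / 10)))   # about 4.6 bit/s/Hz
```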
href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yabin Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xiang Lv</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haoneng Luo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+B">Bin Ma</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jialong Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yunlan Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fan Yu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhijie Yan</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06282v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06282v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06282v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. 
The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, as well as mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100 ms, and the full-duplex latency is approximately 600 ms in theory and 800 ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.
Submitted 10 January, 2025; originally announced January 2025.
Comments: Work in progress. Authors are listed in alphabetical order by family name

arXiv:2501.05241 (https://arxiv.org/abs/2501.05241) [pdf, other]
Categories: eess.IV, cs.CV
Title: Contrast-Free Myocardial Scar Segmentation in Cine MRI using Motion and Texture Fusion
Authors: Guang Yang, Jingkun Chen, Xicheng Sheng, Shan Yang, Xiahai Zhuang, Betty Raman, Lei Li, Vicente Grau
Abstract: Late gadolinium enhancement MRI (LGE MRI) is the gold standard for the detection of myocardial scars for post myocardial infarction (MI). LGE MRI requires the injection of a contrast agent, which carries potential side effects and increases scanning time and patient discomfort.
To address these issues, we propose a novel framework that combines cardiac motion observed in cine MRI with image texture information to segment the myocardium and scar tissue in the left ventricle. Cardiac motion tracking can be formulated as a full cardiac image cycle registration problem, which can be solved via deep neural networks. Experimental results prove that the proposed method can achieve scar segmentation based on non-contrasted cine images with comparable accuracy to LGE MRI. This demonstrates its potential as an alternative to contrast-enhanced techniques for scar detection.
Submitted 9 January, 2025; originally announced January 2025.
Comments: 5 pages, 2 figures, 2 tables

arXiv:2501.04487 (https://arxiv.org/abs/2501.04487) [pdf]
Categories: cs.LG, cs.AI
Title: Integrating remote sensing data assimilation, deep learning and large language model for interactive wheat breeding yield prediction
Authors: Guofeng Yang, Nanfei Jin, Wenjie Ai, Zhonghua Zheng, Yuhong He, Yong He
Abstract: Yield is one of the core goals of crop breeding. By predicting the potential yield of different breeding materials, breeders can screen these materials at various growth stages to select the best performing. Based on unmanned aerial vehicle remote sensing technology, high-throughput crop phenotyping data in breeding areas is collected to provide data support for the breeding decisions of breeders. However, the accuracy of current yield predictions still requires improvement, and the usability and user-friendliness of yield forecasting tools remain suboptimal. To address these challenges, this study introduces a hybrid method and tool for crop yield prediction, designed to allow breeders to interactively and accurately predict wheat yield by chatting with a large language model (LLM). First, the newly designed data assimilation algorithm is used to assimilate the leaf area index into the WOFOST model. Then, selected outputs from the assimilation process, along with remote sensing inversion results, are used to drive the time-series temporal fusion transformer model for wheat yield prediction. Finally, based on this hybrid method and leveraging an LLM with retrieval augmented generation technology, we developed an interactive yield prediction Web tool that is user-friendly and supports sustainable data updates. This tool integrates multi-source data to assist breeding decision-making. This study aims to accelerate the identification of high-yield materials in the breeding process, enhance breeding efficiency, and enable more scientific and smart breeding decisions.
Submitted 8 January, 2025; originally announced January 2025.
arXiv:2412.20390 (https://arxiv.org/abs/2412.20390) [pdf, other]
Categories: cs.CV
Title: MetricDepth: Enhancing Monocular Depth Estimation with Deep Metric Learning
Authors: Chunpu Liu, Guanglei Yang, Wangmeng Zuo, Tianyi Zan
Abstract: Deep metric learning aims to learn features relying on the consistency or divergence of class labels. However, in monocular depth estimation, the absence of a natural definition of class poses challenges in the leveraging of deep metric learning. Addressing this gap, this paper introduces MetricDepth, a novel method that integrates deep metric learning to enhance the performance of monocular depth estimation. To overcome the inapplicability of the class-based sample identification in previous deep metric learning methods to the monocular depth estimation task, we design the differential-based sample identification. This innovative approach identifies feature samples as different sample types by their depth differentials relative to the anchor, laying a foundation for feature regularizing in monocular depth estimation models. Building upon this advancement, we then address another critical problem caused by the vast range and the continuity of depth annotations in monocular depth estimation. The extensive and continuous annotations lead to diverse differentials of negative samples to the anchor feature, representing the varied impact of negative samples during feature regularizing. Recognizing the inadequacy of the uniform strategy in previous deep metric learning methods for handling negative samples in the monocular depth estimation task, we propose the multi-range strategy. Through further distinction of negative samples according to depth differential ranges and implementation of diverse regularizing, our multi-range strategy facilitates differentiated regularization interactions between the anchor feature and its negative samples.
Experiments across various datasets and model types demonstrate the effectiveness and versatility of MetricDepth, confirming its potential for performance enhancement in the monocular depth estimation task.
Submitted 29 December, 2024; originally announced December 2024.
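The differential-based sample identification and multi-range strategy described above can be caricatured as bucketing pixels by their absolute depth difference to an anchor; the thresholds and names below are illustrative placeholders, not the paper's settings or losses.

```python
import torch

def identify_samples(depth, anchor_idx, pos_thresh=0.05, range_edges=(0.2, 0.5)):
    """Bucket pixels by their depth differential to an anchor pixel: small
    differentials act as positives, larger ones as negatives split into ranges
    that could be regularized with different strengths.
    depth: (N,) tensor of per-pixel depths."""
    diff = (depth - depth[anchor_idx]).abs()
    positives = diff <= pos_thresh
    negative_ranges = []
    lo = pos_thresh
    for hi in range_edges + (float("inf"),):
        negative_ranges.append((diff > lo) & (diff <= hi))
        lo = hi
    return positives, negative_ranges
```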
To overcome these challenges, we introduce a simple yet effective module, Geographically Masked Convolutional Gated Recurrent Unit (Geo-ConvGRU), tailored for Bird's-Eye View segmentation. Specifically, we substitute the 3D CNN layers with ConvGRU in the temporal module to bolster the capacity of networks for handling temporal dependencies. Additionally, we integrate a geographical mask into the Convolutional Gated Recurrent Unit to suppress noise introduced by the temporal module. Comprehensive experiments conducted on the NuScenes dataset substantiate the merits of the proposed Geo-ConvGRU, revealing that our approach attains state-of-the-art performance in Bird's-Eye View segmentation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20162">arXiv:2412.20162</a> <span> [<a href="https://arxiv.org/pdf/2412.20162">pdf</a>, <a href="https://arxiv.org/format/2412.20162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Modality Driven LoRA for Adverse Condition Depth Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanglei Yang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+R">Rui Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongqiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhun Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongqiang Li</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+W">Wangmeng Zuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The autonomous driving community is increasingly focused on addressing corner case problems, particularly those related to ensuring driving safety under adverse conditions (e.g., nighttime, fog, rain). To this end, the task of Adverse Condition Depth Estimation (ACDE) has gained significant attention.
Previous approaches in ACDE have primarily relied on generative models, which necessitate additional target images to convert the sunny condition into adverse weather, or learnable parameters for feature augmentation to adapt domain gaps, resulting in increased model complexity and tuning efforts. Furthermore, unlike CLIP-based methods where textual and visual features have been pre-aligned, depth estimation models lack sufficient alignment between multimodal features, hindering coherent understanding under adverse conditions. To address these limitations, we propose Multi-Modality Driven LoRA (MMD-LoRA), which leverages low-rank adaptation matrices for efficient fine-tuning from source-domain to target-domain. It consists of two core components: Prompt Driven Domain Alignment (PDDA) and Visual-Text Consistent Contrastive Learning (VTCCL). During PDDA, the image encoder with MMD-LoRA generates target-domain visual representations, supervised by an alignment loss that encourages the source-to-target difference computed in the language space to match the one computed in the image space. Meanwhile, VTCCL bridges the gap between textual features from CLIP and visual features from the diffusion model, pushing apart different weather representations (vision and text) and bringing together similar ones. Through extensive experiments, the proposed method achieves state-of-the-art performance on the nuScenes and Oxford RobotCar datasets, underscoring robustness and efficiency in adapting to varied adverse environments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19547">arXiv:2412.19547</a> <span> [<a href="https://arxiv.org/pdf/2412.19547">pdf</a>, <a href="https://arxiv.org/format/2412.19547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unprejudiced Training Auxiliary Tasks Makes Primary Better: A Multi-Task Learning Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanze Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chun-Mei Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qilong Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanglei Yang</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+W">Wangmeng Zuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> Human beings can leverage knowledge from related tasks to improve learning on a primary task. Similarly, multi-task learning methods suggest using auxiliary tasks to enhance a neural network's performance on a specific primary task. However, previous methods often select auxiliary tasks carefully but treat them as secondary during training. The weights assigned to auxiliary losses are typically smaller than the primary loss weight, leading to insufficient training on auxiliary tasks and ultimately failing to support the main task effectively. To address this issue, we propose an uncertainty-based impartial learning method that ensures balanced training across all tasks. Additionally, we consider both gradients and uncertainty information during backpropagation to further improve performance on the primary task. Extensive experiments show that our method achieves performance comparable to or better than state-of-the-art approaches. Moreover, our weighting strategy is effective and robust in enhancing the performance of the primary task regardless of noise in the auxiliary tasks' pseudo labels. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17979">arXiv:2412.17979</a> <span> [<a href="https://arxiv.org/pdf/2412.17979">pdf</a>, <a href="https://arxiv.org/format/2412.17979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> PHICOIN (PHI): The Proof of Work High-Performance Infrastructure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Trinh%2C+P">Peter Trinh</a>, <a href="/search/cs?searchtype=author&query=Iqbal%2C+S">Sannan Iqbal</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Justin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> PHICOIN (PHI) is a high-performance cryptocurrency based on the Proof-of-Work (PoW) mechanism. It aims to provide ordinary users with decentralized participation opportunities through an improved and innovative mining algorithm and fair design principles. PHI addresses the challenges of centralization in cryptocurrency mining by enhancing resistance to ASIC and FPGA devices and promoting fair participation. This paper outlines the technical specifications, mission, and roadmap for PHI, highlighting its potential to become a foundational infrastructure for PoW cryptocurrencies. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15979">arXiv:2412.15979</a> <span> [<a href="https://arxiv.org/pdf/2412.15979">pdf</a>, <a href="https://arxiv.org/format/2412.15979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MR-GDINO: Efficient Open-World Continual Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bowen Dong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zitong Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanglei Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+W">Wangmeng Zuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> Open-world (OW) recognition and detection models show strong zero- and few-shot adaptation abilities, inspiring their use as initializations in continual learning methods to improve performance. Despite promising results on seen classes, such OW abilities on unseen classes are largely degenerated due to catastrophic forgetting. To tackle this challenge, we propose an open-world continual object detection task, requiring detectors to generalize to old, new, and unseen categories in continual learning scenarios. Based on this task, we present a challenging yet practical OW-COD benchmark to assess detection abilities. The goal is to motivate OW detectors to simultaneously preserve learned classes, adapt to new classes, and maintain open-world capabilities under few-shot adaptations. To mitigate forgetting in unseen categories, we propose MR-GDINO, a strong, efficient and scalable baseline via memory and retrieval mechanisms within a highly scalable memory pool. Experimental results show that existing continual detectors suffer from severe forgetting for both seen and unseen categories. In contrast, MR-GDINO largely mitigates forgetting with only 0.1% activated extra parameters, achieving state-of-the-art performance for old, new, and unseen categories. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: https://m1saka.moe/owcod/ .
Code is available at: https://github.com/DongSky/MR-GDINO</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15921">arXiv:2412.15921</a> <span> [<a href="https://arxiv.org/pdf/2412.15921">pdf</a>, <a href="https://arxiv.org/format/2412.15921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Less is More: Towards Green Code Large Language Models via Unified Structural Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Ke Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+T+Y">Terry Yue Zhuo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Taolue Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The extensive application of Large Language Models (LLMs) in generative coding tasks has raised concerns due to their high computational demands and energy consumption. Unlike previous structural pruning methods designed for classification models that deal with low-dimensional classification logits, generative Code LLMs produce high-dimensional token logit sequences, making traditional pruning objectives inherently limited. Moreover, existing single-component pruning approaches further constrain the effectiveness when applied to generative Code LLMs. In response, we propose Flab-Pruner, an innovative unified structural pruning method that combines vocabulary, layer, and Feed-Forward Network (FFN) pruning. This approach effectively reduces model parameters while maintaining performance. Additionally, we introduce a customized code instruction data strategy for coding tasks to enhance the performance recovery efficiency of the pruned model.
Through extensive evaluations on three state-of-the-art Code LLMs across multiple generative coding tasks, the results demonstrate that Flab-Pruner retains 97% of the original performance after pruning 22% of the parameters and achieves the same or even better performance after post-training. The pruned models exhibit significant improvements in storage, GPU usage, computational efficiency, and environmental impact, while maintaining robustness. Our research provides a sustainable solution for green software engineering and promotes the efficient deployment of LLMs in real-world generative coding intelligence applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">UNDER REVIEW</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13877">arXiv:2412.13877</a> <span> [<a href="https://arxiv.org/pdf/2412.13877">pdf</a>, <a href="https://arxiv.org/format/2412.13877">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RoboMIND: Benchmark on Multi-embodiment Intelligence Normative Data for Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kun Wu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+C">Chengkai Hou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/cs?searchtype=author&query=Che%2C+Z">Zhengping Che</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+X">Xiaozhu Ju</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuqin Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meng Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yinuo Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiyuan Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+S">Shichao Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinhua Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+F">Fei Liao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangyu Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhao Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lecheng Wang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jilei Mao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ning Liu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+P">Pei Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Y">Yaoxu Lyu</a>, <a 
href="/search/cs?searchtype=author&query=Liu%2C+M">Mengzhen Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingyang He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yulin Luo</a> , et al. (12 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13877v2-abstract-short" style="display: inline;"> In this paper, we introduce RoboMIND (Multi-embodiment Intelligence Normative Data for Robot Manipulation), a dataset containing 107k demonstration trajectories across 479 diverse tasks involving 96 object classes. RoboMIND is collected through human teleoperation and encompasses comprehensive robotic-related information, including multi-view observations, proprioceptive robot state information, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13877v2-abstract-full').style.display = 'inline'; document.getElementById('2412.13877v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13877v2-abstract-full" style="display: none;"> In this paper, we introduce RoboMIND (Multi-embodiment Intelligence Normative Data for Robot Manipulation), a dataset containing 107k demonstration trajectories across 479 diverse tasks involving 96 object classes. RoboMIND is collected through human teleoperation and encompasses comprehensive robotic-related information, including multi-view observations, proprioceptive robot state information, and linguistic task descriptions. To ensure data consistency and reliability for imitation learning, RoboMIND is built on a unified data collection platform and a standardized protocol, covering four distinct robotic embodiments: the Franka Emika Panda, the UR5e, the AgileX dual-arm robot, and a humanoid robot with dual dexterous hands. Our dataset also includes 5k real-world failure demonstrations, each accompanied by detailed causes, enabling failure reflection and correction during policy learning. Additionally, we created a digital twin environment in the Isaac Sim simulator, replicating the real-world tasks and assets, which facilitates the low-cost collection of additional training data and enables efficient evaluation. To demonstrate the quality and diversity of our dataset, we conducted extensive experiments using various imitation learning methods for single-task settings and state-of-the-art Vision-Language-Action (VLA) models for multi-task scenarios. By leveraging RoboMIND, the VLA models achieved high manipulation success rates and demonstrated strong generalization capabilities. To the best of our knowledge, RoboMIND is the largest multi-embodiment teleoperation dataset collected on a unified platform, providing large-scale and high-quality robotic training data. Our project is at https://x-humanoid-robomind.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13877v2-abstract-full').style.display = 'none'; document.getElementById('2412.13877v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13196">arXiv:2412.13196</a> <span> [<a href="https://arxiv.org/pdf/2412.13196">pdf</a>, <a href="https://arxiv.org/format/2412.13196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExBody2: Advanced Expressive Humanoid Whole-Body Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+M">Mazeyu Ji</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xuanbin Peng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fangchen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xuxin Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper enables real-world humanoid robots to maintain stability while performing expressive motions like humans do. We propose ExBody2, a generalized whole-body tracking framework that can take any reference motion inputs and control the humanoid to mimic the motion. The model is trained in simulation with Reinforcement Learning and then transferred to the real world. It decouples keypoint tracking from velocity control, and effectively leverages a privileged teacher policy to distill precise mimic skills into the target student policy, which enables high-fidelity replication of dynamic movements such as running, crouching, dancing, and other challenging motions. We present a comprehensive qualitative and quantitative analysis of crucial design factors in the paper. We conduct our experiments on two humanoid platforms and demonstrate the superiority of our approach against state-of-the-art methods, providing practical guidelines to pursue the extreme of whole-body control for humanoid robots.
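The abstract's teacher-student setup (a privileged teacher policy distilled into a deployable student) follows a common sim-to-real recipe. The sketch below is a generic, hypothetical illustration of that recipe in PyTorch, not the authors' code; observation sizes, network shapes, and the plain MSE distillation objective are all assumptions.

```python
import torch
import torch.nn as nn

# Hypothetical dimensions; the real ExBody2 interfaces are not specified here.
PRIV_OBS, STUDENT_OBS, ACT_DIM = 512, 256, 19

def mlp(in_dim: int, out_dim: int) -> nn.Sequential:
    return nn.Sequential(
        nn.Linear(in_dim, 256), nn.ELU(),
        nn.Linear(256, 128), nn.ELU(),
        nn.Linear(128, out_dim),
    )

teacher = mlp(PRIV_OBS, ACT_DIM)     # assumed pre-trained with RL on privileged simulator state
student = mlp(STUDENT_OBS, ACT_DIM)  # sees only observations available on the real robot
optimizer = torch.optim.Adam(student.parameters(), lr=1e-4)

def distill_step(priv_obs: torch.Tensor, student_obs: torch.Tensor) -> float:
    """One DAgger-style distillation step: regress student actions onto teacher actions."""
    with torch.no_grad():
        target_actions = teacher(priv_obs)
    loss = nn.functional.mse_loss(student(student_obs), target_actions)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```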
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13196v1-abstract-full').style.display = 'none'; document.getElementById('2412.13196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">website: https://exbody2.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11938">arXiv:2412.11938</a> <span> [<a href="https://arxiv.org/pdf/2412.11938">pdf</a>, <a href="https://arxiv.org/format/2412.11938">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Are the Latent Representations of Foundation Models for Pathology Invariant to Rotation? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Elphick%2C+M">Matou拧 Elphick</a>, <a href="/search/cs?searchtype=author&query=Turajlic%2C+S">Samra Turajlic</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11938v1-abstract-short" style="display: inline;"> Self-supervised foundation models for digital pathology encode small patches from H\&E whole slide images into latent representations used for downstream tasks. However, the invariance of these representations to patch rotation remains unexplored. This study investigates the rotational invariance of latent representations across twelve foundation models by quantifying the alignment between non-rot… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11938v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11938v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11938v1-abstract-full" style="display: none;"> Self-supervised foundation models for digital pathology encode small patches from H\&E whole slide images into latent representations used for downstream tasks. However, the invariance of these representations to patch rotation remains unexplored. This study investigates the rotational invariance of latent representations across twelve foundation models by quantifying the alignment between non-rotated and rotated patches using mutual $k$-nearest neighbours and cosine distance. Models that incorporated rotation augmentation during self-supervised training exhibited significantly greater invariance to rotations. We hypothesise that the absence of rotational inductive bias in the transformer architecture necessitates rotation augmentation during training to achieve learned invariance. Code: https://github.com/MatousE/rot-invariance-analysis. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11938v1-abstract-full').style.display = 'none'; document.getElementById('2412.11938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Samra Turajlic and Guang Yang are joint last authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11087">arXiv:2412.11087</a> <span> [<a href="https://arxiv.org/pdf/2412.11087">pdf</a>, <a href="https://arxiv.org/format/2412.11087">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Large Vision-Language Model as User Intent-aware Encoder for Composed Image Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zelong Sun</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+D">Dong Jing</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guoxing Yang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+N">Nanyi Fei</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiwu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11087v1-abstract-short" style="display: inline;"> Composed Image Retrieval (CIR) aims to retrieve target images from candidate set using a hybrid-modality query consisting of a reference image and a relative caption that describes the user intent. Recent studies attempt to utilize Vision-Language Pre-training Models (VLPMs) with various fusion strategies for addressing the task.However, these methods typically fail to simultaneously meet two key… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11087v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11087v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11087v1-abstract-full" style="display: none;"> Composed Image Retrieval (CIR) aims to retrieve target images from candidate set using a hybrid-modality query consisting of a reference image and a relative caption that describes the user intent. Recent studies attempt to utilize Vision-Language Pre-training Models (VLPMs) with various fusion strategies for addressing the task.However, these methods typically fail to simultaneously meet two key requirements of CIR: comprehensively extracting visual information and faithfully following the user intent. In this work, we propose CIR-LVLM, a novel framework that leverages the large vision-language model (LVLM) as the powerful user intent-aware encoder to better meet these requirements. Our motivation is to explore the advanced reasoning and instruction-following capabilities of LVLM for accurately understanding and responding the user intent. 
Furthermore, we design a novel hybrid intent instruction module to provide explicit intent guidance at two levels: (1) The task prompt clarifies the task requirement and assists the model in discerning user intent at the task level. (2) The instance-specific soft prompt, which is adaptively selected from the learnable prompt pool, enables the model to better comprehend the user intent at the instance level compared to a universal prompt for all instances. CIR-LVLM achieves state-of-the-art performance across three prominent benchmarks with acceptable inference efficiency. We believe this study provides fundamental insights into CIR-related fields. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08587">arXiv:2412.08587</a> <span> [<a href="https://arxiv.org/pdf/2412.08587">pdf</a>, <a href="https://arxiv.org/ps/2412.08587">ps</a>, <a href="https://arxiv.org/format/2412.08587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Advancing Single- and Multi-task Text Classification through Large Language Model Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hang Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q+P">Qile P. Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y+B">Yijing Barry Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Gang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Both encoder-only models (e.g., BERT, RoBERTa) and large language models (LLMs, e.g., Llama3) have been widely used for text classification tasks.
However, there is a lack of systematic studies comparing the performance of encoder-based models and LLMs in text classification, particularly when fine-tuning is involved. This study employed a diverse range of models and methods, varying in size and architecture, and including both fine-tuned and pre-trained approaches. We first assessed the performances of these LLMs on the 20 Newsgroups (20NG) and MASSIVE datasets, comparing them to encoder-only RoBERTa models. Additionally, we explored the multi-task capabilities of both model types by combining multiple classification tasks, including intent detection and slot-filling, into a single model using data from both datasets. Our results indicate that fully fine-tuned Llama3-70B models outperform RoBERTa-large and other decoder LLMs across various classification tasks and datasets. Moreover, the consolidated multi-task fine-tuned LLMs matched the performance of dual-model setups in both tasks across both datasets. Overall, our study provides a comprehensive benchmark of encoder-only and LLM models on text classification tasks and demonstrates a method to combine two or more fully fine-tuned decoder LLMs for reduced latency and equivalent performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08580">arXiv:2412.08580</a> <span> [<a href="https://arxiv.org/pdf/2412.08580">pdf</a>, <a href="https://arxiv.org/format/2412.08580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LAION-SG: An Enhanced Large-Scale Dataset for Training Complex Image-Text Models with Structural Annotations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zejian Li</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+C">Chenye Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yize Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Ling Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiarui Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiayi Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Changyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+J">Jinxiong Chang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lingyun Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
id="2412.08580v2-abstract-short" style="display: inline;"> Recent advances in text-to-image (T2I) generation have shown remarkable success in producing high-quality images from text. However, existing T2I models show decayed performance in compositional image generation involving multiple objects and intricate relationships. We attribute this problem to limitations in existing datasets of image-text pairs, which lack precise inter-object relationship anno… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08580v2-abstract-full').style.display = 'inline'; document.getElementById('2412.08580v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08580v2-abstract-full" style="display: none;"> Recent advances in text-to-image (T2I) generation have shown remarkable success in producing high-quality images from text. However, existing T2I models show decayed performance in compositional image generation involving multiple objects and intricate relationships. We attribute this problem to limitations in existing datasets of image-text pairs, which lack precise inter-object relationship annotations with prompts only. To address this problem, we construct LAION-SG, a large-scale dataset with high-quality structural annotations of scene graphs (SG), which precisely describe attributes and relationships of multiple objects, effectively representing the semantic structure in complex scenes. Based on LAION-SG, we train a new foundation model SDXL-SG to incorporate structural annotation information into the generation process. Extensive experiments show advanced models trained on our LAION-SG boast significant performance improvements in complex scene generation over models on existing datasets. We also introduce CompSG-Bench, a benchmark that evaluates models on compositional image generation, establishing a new standard for this domain. Our annotations with the associated processing code, the foundation model and the benchmark protocol are publicly available at https://github.com/mengcye/LAION-SG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08580v2-abstract-full').style.display = 'none'; document.getElementById('2412.08580v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07773">arXiv:2412.07773</a> <span> [<a href="https://arxiv.org/pdf/2412.07773">pdf</a>, <a href="https://arxiv.org/format/2412.07773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mobile-TeleVision: Predictive Motion Priors for Humanoid Whole-Body Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chenhao Lu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xuxin Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shiqi Yang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+M">Mazeyu Ji</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chengjing Yuan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+S">Sha Yi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Humanoid robots require both robust lower-body locomotion and precise upper-body manipulation. While recent Reinforcement Learning (RL) approaches provide whole-body loco-manipulation policies, they lack precise manipulation with high DoF arms. In this paper, we propose decoupling upper-body control from locomotion, using inverse kinematics (IK) and motion retargeting for precise manipulation, while RL focuses on robust lower-body locomotion. We introduce PMP (Predictive Motion Priors), trained with a Conditional Variational Autoencoder (CVAE) to effectively represent upper-body motions. The locomotion policy is trained conditioned on this upper-body motion representation, ensuring that the system remains robust with both manipulation and locomotion. We show that CVAE features are crucial for stability and robustness, and that our approach significantly outperforms RL-based whole-body control in precise manipulation.
With precise upper-body motion and robust lower-body locomotion control, operators can remotely control the humanoid to walk around and explore different environments, while performing diverse manipulation tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07674">arXiv:2412.07674</a> <span> [<a href="https://arxiv.org/pdf/2412.07674">pdf</a>, <a href="https://arxiv.org/format/2412.07674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FiVA: Fine-grained Visual Attribute Dataset for Text-to-Image Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tong Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghao Xu</a>, <a href="/search/cs?searchtype=author&query=Po%2C+R">Ryan Po</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guandao Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advances in text-to-image generation have enabled the creation of high-quality images with diverse applications. However, accurately describing desired visual attributes can be challenging, especially for non-experts in art and photography. An intuitive solution involves adopting favorable attributes from the source images. Current methods attempt to distill identity and style from source images. However, "style" is a broad concept that includes texture, color, and artistic elements, but does not cover other important attributes such as lighting and dynamics.
Additionally, a simplified "style" adaptation prevents combining multiple attributes from different sources into one generated image. In this work, we formulate a more effective approach to decompose the aesthetics of a picture into specific visual attributes, allowing users to apply characteristics such as lighting, texture, and dynamics from different images. To achieve this goal, we constructed, to the best of our knowledge, the first fine-grained visual attributes dataset (FiVA). This FiVA dataset features a well-organized taxonomy for visual attributes and includes around 1M high-quality generated images with visual attribute annotations. Leveraging this dataset, we propose a fine-grained visual attribute adaptation framework (FiVA-Adapter), which decouples and adapts visual attributes from one or more source images into a generated one. This approach enhances user-friendly customization, allowing users to selectively apply desired attributes to create images that meet their unique preferences and specific content requirements. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 (Datasets and Benchmarks Track); Project page: https://fiva-dataset.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06785">arXiv:2412.06785</a> <span> [<a href="https://arxiv.org/pdf/2412.06785">pdf</a>, <a href="https://arxiv.org/format/2412.06785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Tactile DreamFusion: Exploiting Tactile Sensing for 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+R">Ruihan Gao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+K">Kangle Deng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Gengshan Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wenzhen Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> 3D generation methods have shown visually compelling results powered by diffusion image priors. However, they often fail to produce realistic geometric details, resulting in overly smooth surfaces or geometric details inaccurately baked in albedo maps. To address this, we introduce a new method that incorporates touch as an additional modality to improve the geometric details of generated 3D assets. We design a lightweight 3D texture field to synthesize visual and tactile textures, guided by 2D diffusion model priors on both visual and tactile domains. We condition the visual texture generation on high-resolution tactile normals and guide the patch-based tactile texture refinement with a customized TextureDreambooth. We further present a multi-part generation pipeline that enables us to synthesize different textures across various regions. To our knowledge, we are the first to leverage high-resolution tactile sensing to enhance geometric details for 3D generation tasks. We evaluate our method in both text-to-3D and image-to-3D settings. Our experiments demonstrate that our method provides customized and realistic fine geometric textures while maintaining accurate alignment between two modalities of vision and touch. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024.
arXiv:2412.06192 [pdf, other] (https://arxiv.org/abs/2412.06192)
Subjects: cs.RO (Robotics)
Title: PoLaRIS Dataset: A Maritime Object Detection and Tracking Dataset in Pohang Canal
Authors: Jiwon Choi, Dongjin Cho, Gihyeon Lee, Hogyun Kim, Geonmo Yang, Joowan Kim, Younggun Cho
Abstract: Maritime environments often present hazardous situations due to factors such as moving ships or buoys, which become obstacles under the influence of waves. In such challenging conditions, the ability to detect and track potentially hazardous objects is critical for the safe navigation of marine robots. To address the scarcity of comprehensive datasets capturing these dynamic scenarios, we introduce a new multi-modal dataset that includes image and point-wise annotations of maritime hazards. Our dataset provides detailed ground truth for obstacle detection and tracking, including objects as small as 10$\times$10 pixels, which are crucial for maritime safety. To validate the dataset's effectiveness as a reliable benchmark, we conducted evaluations using various methodologies, including state-of-the-art (SOTA) techniques for object detection and tracking. These evaluations are expected to contribute to performance improvements, particularly in the complex maritime environment. To the best of our knowledge, this is the first dataset offering multi-modal annotations specifically tailored to maritime environments. Our dataset is available at https://sites.google.com/view/polaris-dataset.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06192v2-abstract-full').style.display = 'none'; document.getElementById('2412.06192v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06161">arXiv:2412.06161</a> <span> [<a href="https://arxiv.org/pdf/2412.06161">pdf</a>, <a href="https://arxiv.org/format/2412.06161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Harpagon: Minimizing DNN Serving Cost via Efficient Dispatching, Scheduling and Splitting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhixin Zhao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yitao Hu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Ziqi Gong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guotao Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenxin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiulong Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Keqiu Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06161v1-abstract-short" style="display: inline;"> Advances in deep neural networks (DNNs) have significantly contributed to the development of real-time video processing applications. Efficient scheduling of DNN workloads in cloud-hosted inference systems is crucial to minimizing serving costs while meeting application latency constraints. However, existing systems suffer from excessive module latency during request dispatching, low execution thr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06161v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06161v1-abstract-full" style="display: none;"> Advances in deep neural networks (DNNs) have significantly contributed to the development of real-time video processing applications. Efficient scheduling of DNN workloads in cloud-hosted inference systems is crucial to minimizing serving costs while meeting application latency constraints. However, existing systems suffer from excessive module latency during request dispatching, low execution throughput during module scheduling, and wasted latency budget during latency splitting for multi-DNN application, which undermines their capability to minimize the serving cost. In this paper, we design a DNN inference system called Harpagon, which minimizes the serving cost under latency constraints with a three-level design. 
It first maximizes the batch collection rate with a batch-aware request dispatch policy to minimize the module latency. It then maximizes the module throughput with multi-tuple configurations and a proper amount of dummy requests. It also carefully splits the end-to-end latency into per-module latency budgets to minimize the total serving cost for multi-DNN applications. Evaluation shows that Harpagon outperforms the state of the art by 1.49 to 2.37 times in serving cost while satisfying the latency objectives. Additionally, compared to the optimal solution using brute-force search, Harpagon derives the lower bound of serving cost for 91.5% of workloads with millisecond-level runtime.
Submitted: 8 December, 2024; originally announced December 2024.
Comments: Accepted to IEEE INFOCOM 2025
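As background for the cost model the abstract hints at, the toy sketch below shows how splitting an end-to-end latency budget across pipeline modules changes the number of replicas (and hence cost) needed to sustain a target request rate. The batching model and all numbers are assumptions for illustration; this is not Harpagon's dispatch or splitting policy.

```python
# Toy latency-splitting illustration for a two-module pipeline (assumed model, not Harpagon).
import math

def instances_needed(rate_rps, per_item_ms, batch_overhead_ms, latency_budget_ms):
    """Pick the largest batch that still fits this module's latency budget,
    then compute how many replicas are needed to sustain the arrival rate."""
    batch = max(1, int((latency_budget_ms - batch_overhead_ms) // per_item_ms))
    batch_latency = batch_overhead_ms + batch * per_item_ms
    throughput = batch / (batch_latency / 1000.0)       # requests per second per replica
    return math.ceil(rate_rps / throughput)

rate = 200.0                              # requests per second
modules = [(2.0, 10.0), (4.0, 20.0)]      # (per-item ms, fixed overhead ms) for each module
total_budget = 120.0                      # end-to-end latency constraint in ms

for split in (0.3, 0.5, 0.7):             # fraction of the budget given to module 1
    budgets = (split * total_budget, (1 - split) * total_budget)
    cost = sum(instances_needed(rate, p, o, b) for (p, o), b in zip(modules, budgets))
    print(f"split={split:.1f} -> total instances={cost}")
```

Different splits of the same budget yield different replica counts, which is the quantity a splitting policy like the one described would try to minimize.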
arXiv:2412.03937 [pdf, other] (https://arxiv.org/abs/2412.03937)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: AIpparel: A Large Multimodal Generative Model for Digital Garments
Authors: Kiyohiro Nakayama, Jan Ackermann, Timur Levent Kesdogan, Yang Zheng, Maria Korosteleva, Olga Sorkine-Hornung, Leonidas J. Guibas, Guandao Yang, Gordon Wetzstein
Abstract: Apparel is essential to human life, offering protection, mirroring cultural identities, and showcasing personal style. Yet, the creation of garments remains a time-consuming process, largely due to the manual work involved in designing them. To simplify this process, we introduce AIpparel, a large multimodal model for generating and editing sewing patterns. Our model fine-tunes state-of-the-art large multimodal models (LMMs) on a custom-curated large-scale dataset of over 120,000 unique garments, each with multimodal annotations including text, images, and sewing patterns. Additionally, we propose a novel tokenization scheme that concisely encodes these complex sewing patterns so that LLMs can learn to predict them efficiently. AIpparel achieves state-of-the-art performance in single-modal tasks, including text-to-garment and image-to-garment prediction, and enables novel multimodal garment generation applications such as interactive garment editing. The project website is at georgenakayama.github.io/AIpparel/.
Submitted: 15 December, 2024; v1 submitted 5 December, 2024; originally announced December 2024.
Comments: The project website is at georgenakayama.github.io/AIpparel/
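The abstract mentions a tokenization scheme for sewing patterns without describing it. The snippet below is a generic illustration of how continuous pattern geometry can be turned into a discrete token sequence for a language model by quantizing panel vertex coordinates; the vocabulary layout, bin count, and structural tokens are assumptions, not AIpparel's actual scheme.

```python
# Generic coordinate-quantization tokenizer for 2D panel outlines (assumed scheme, not AIpparel's).
NUM_BINS = 256                         # quantization resolution per axis
PANEL_START, PANEL_END = "<panel>", "</panel>"

def tokenize_panel(vertices, extent_cm=200.0):
    """vertices: list of (x, y) in centimeters within [0, extent_cm).
    Returns a flat token sequence: a structural token, then two coordinate tokens per vertex."""
    tokens = [PANEL_START]
    for x, y in vertices:
        qx = min(NUM_BINS - 1, int(x / extent_cm * NUM_BINS))
        qy = min(NUM_BINS - 1, int(y / extent_cm * NUM_BINS))
        tokens += [f"<x_{qx}>", f"<y_{qy}>"]
    tokens.append(PANEL_END)
    return tokens

# A small rectangular panel, 50 cm x 70 cm.
print(tokenize_panel([(0, 0), (50, 0), (50, 70), (0, 70)]))
# ['<panel>', '<x_0>', '<y_0>', '<x_64>', '<y_0>', '<x_64>', '<y_89>', '<x_0>', '<y_89>', '</panel>']
```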
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project website is at georgenakayama.github.io/AIpparel/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03249">arXiv:2412.03249</a> <span> [<a href="https://arxiv.org/pdf/2412.03249">pdf</a>, <a href="https://arxiv.org/format/2412.03249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> MLQM: Machine Learning Approach for Accelerating Optimal Qubit Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wenjie Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lianhui Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhigang Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Geng Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guowu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03249v1-abstract-short" style="display: inline;"> Quantum circuit mapping is a critical process in quantum computing that involves adapting logical quantum circuits to adhere to hardware constraints, thereby generating physically executable quantum circuits. Current quantum circuit mapping techniques, such as solver-based methods, often encounter challenges related to slow solving speeds due to factors like redundant search iterations. Regarding… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03249v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03249v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03249v1-abstract-full" style="display: none;"> Quantum circuit mapping is a critical process in quantum computing that involves adapting logical quantum circuits to adhere to hardware constraints, thereby generating physically executable quantum circuits. Current quantum circuit mapping techniques, such as solver-based methods, often encounter challenges related to slow solving speeds due to factors like redundant search iterations. Regarding this issue, we propose a machine learning approach for accelerating optimal qubit mapping (MLQM). First, the method proposes a global search space pruning scheme based on prior knowledge and machine learning, which in turn improves the solution efficiency. Second, to address the limited availability of effective samples in the learning task, MLQM introduces a novel data augmentation and refinement scheme, this scheme enhances the size and diversity of the quantum circuit dataset by exploiting gate allocation and qubit rearrangement. Finally, MLQM also further improves the solution efficiency by pruning the local search space, which is achieved through an adaptive dynamic adjustment mechanism of the solver variables. 
Compared to state-of-the-art qubit mapping approaches, MLQM achieves optimal qubit mapping with an average solving speed-up ratio of 1.79 and demonstrates an average advantage of 22% in terms of space complexity.
Submitted: 4 December, 2024; originally announced December 2024.

arXiv:2412.02779 [pdf, other] (https://arxiv.org/abs/2412.02779)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.ET (Emerging Technologies)
Title: Synergistic Development of Perovskite Memristors and Algorithms for Robust Analog Computing
Authors: Nanyang Ye, Qiao Sun, Yifei Wang, Liujia Yang, Jundong Zhou, Lei Wang, Guang-Zhong Yang, Xinbing Wang, Chenghu Zhou, Wei Ren, Leilei Gu, Huaqiang Wu, Qinying Gu
Abstract: Analog computing using non-volatile memristors has emerged as a promising solution for energy-efficient deep learning.
New materials such as perovskite-based memristors have recently attracted attention due to their cost-effectiveness, energy efficiency, and flexibility. Yet, challenges in material diversity and immature fabrication require extensive experimentation for device development. Moreover, significant non-idealities in these memristors often impede their use for computing. Here, we propose a synergistic methodology to concurrently optimize perovskite memristor fabrication and develop robust analog DNNs that effectively address the inherent non-idealities of these memristors. Employing Bayesian optimization (BO) with a focus on usability, we efficiently identify optimal materials and fabrication conditions for perovskite memristors. Meanwhile, we developed "BayesMulti", a DNN training strategy utilizing BO-guided noise injection to improve the resistance of analog DNNs to memristor imperfections. Our approach theoretically ensures that within a certain range of parameter perturbations due to memristor non-idealities, the prediction outcomes remain consistent. Our integrated approach enables the use of analog computing in much deeper and wider networks, which significantly outperforms existing methods in diverse tasks like image classification, autonomous driving, species identification, and large vision-language models, achieving up to 100-fold improvements. We further validate our methodology on a 10$\times$10 optimized perovskite memristor crossbar, demonstrating high accuracy in a classification task and low energy consumption. This study offers a versatile solution for efficient optimization of various analog computing systems, encompassing both devices and algorithms.
Submitted: 9 December, 2024; v1 submitted 3 December, 2024; originally announced December 2024.
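Noise injection during training is a standard way to harden networks against analog non-idealities, and it is the generic mechanism behind the "BO-guided noise injection" mentioned above. The sketch below shows only the generic version (random multiplicative weight perturbations in the forward pass); the noise model and its scale are illustrative assumptions, and the Bayesian-optimization guidance described in the abstract is not reproduced here.

```python
# Generic noise-injection training step to mimic memristor weight non-idealities
# (illustrative only; BayesMulti's BO-guided noise model is not reproduced here).
import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    def __init__(self, in_f, out_f, noise_std=0.05):
        super().__init__(in_f, out_f)
        self.noise_std = noise_std

    def forward(self, x):
        if self.training:
            # Multiplicative perturbation: each weight deviates by a few percent per forward pass.
            w = self.weight * (1 + self.noise_std * torch.randn_like(self.weight))
        else:
            w = self.weight
        return nn.functional.linear(x, w, self.bias)

model = nn.Sequential(NoisyLinear(784, 256), nn.ReLU(), NoisyLinear(256, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(32, 784), torch.randint(0, 10, (32,))
loss = nn.functional.cross_entropy(model(x), y)
opt.zero_grad(); loss.backward(); opt.step()
print(float(loss))
```

Training under such perturbations encourages weights whose predictions stay stable when the deployed analog devices deviate from their programmed values.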
arXiv:2412.01959 [pdf, other] (https://arxiv.org/abs/2412.01959)
Subjects: cs.NI (Networking and Internet Architecture)
Title: Development and Application of a Decentralized Domain Name Service
Authors: Guang Yang
Abstract: The current Domain Name System (DNS), as a core infrastructure of the internet, exhibits several shortcomings: its centralized architecture leads to censorship risks and single points of failure, making domain name resolution vulnerable to attacks. The lack of encryption in the resolution process exposes it to DNS hijacking and cache poisoning attacks. Additionally, the high operational costs limit participation and innovation among small to medium-sized users. To address these issues, this paper proposes a Decentralized Domain Name Service (DDNS) based on blockchain (Phicoin) and distributed storage (IPFS). By leveraging the immutability of blockchain and the content verification of IPFS, the system achieves decentralized storage and distribution of domain name records, eliminating the centralized dependencies of traditional DNS. With a block time of 15 seconds, the system supports rapid broadcasting of domain name updates, significantly improving resolution efficiency. The DDNS aims to serve as a complement or backup to the existing DNS system, providing a pollution-resistant, censorship-resistant, high-performance, and low-cost domain name resolution solution, offering a new technical path for the security and stability of the internet.
Submitted: 23 December, 2024; v1 submitted 2 December, 2024; originally announced December 2024.
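The described flow, content-addressed record storage plus an on-chain mapping from name to the latest record hash, can be illustrated with a small self-contained simulation. Everything below is a stand-in: the record format, the dictionaries playing the roles of IPFS and the Phicoin chain, and the function names are assumptions for illustration, not the system's implementation.

```python
# Minimal simulation of content-addressed records plus an on-chain name registry
# (assumed record format; not the Phicoin/IPFS implementation).
import hashlib, json

content_store = {}   # stands in for IPFS: content hash -> record bytes
name_registry = {}   # stands in for on-chain state: domain name -> record hash

def publish(domain, a_record, ttl=300):
    record = json.dumps({"name": domain, "A": a_record, "ttl": ttl}, sort_keys=True).encode()
    h = hashlib.sha256(record).hexdigest()
    content_store[h] = record          # content-addressed storage
    name_registry[domain] = h          # update broadcast in a new block
    return h

def resolve(domain):
    h = name_registry[domain]
    record = content_store[h]
    # Content verification: the fetched data must hash to the on-chain reference.
    assert hashlib.sha256(record).hexdigest() == h, "record tampered with"
    return json.loads(record)

publish("example.phi", "203.0.113.7")
print(resolve("example.phi"))   # {'A': '203.0.113.7', 'name': 'example.phi', 'ttl': 300}
```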
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15551">arXiv:2411.15551</a> <span> [<a href="https://arxiv.org/pdf/2411.15551">pdf</a>, <a href="https://arxiv.org/format/2411.15551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeRF Inpainting with Geometric Diffusion Prior and Balanced Score Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xin Luo</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+Y">Yunwei Lan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ganlin Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15551v1-abstract-short" style="display: inline;"> Recent advances in NeRF inpainting have leveraged pretrained diffusion models to enhance performance. However, these methods often yield suboptimal results due to their ineffective utilization of 2D diffusion priors. The limitations manifest in two critical aspects: the inadequate capture of geometric information by pretrained diffusion models and the suboptimal guidance provided by existing Score… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15551v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15551v1-abstract-full" style="display: none;"> Recent advances in NeRF inpainting have leveraged pretrained diffusion models to enhance performance. However, these methods often yield suboptimal results due to their ineffective utilization of 2D diffusion priors. The limitations manifest in two critical aspects: the inadequate capture of geometric information by pretrained diffusion models and the suboptimal guidance provided by existing Score Distillation Sampling (SDS) methods. To address these problems, we introduce GB-NeRF, a novel framework that enhances NeRF inpainting through improved utilization of 2D diffusion priors. Our approach incorporates two key innovations: a fine-tuning strategy that simultaneously learns appearance and geometric priors and a specialized normal distillation loss that integrates these geometric priors into NeRF inpainting. We propose a technique called Balanced Score Distillation (BSD) that surpasses existing methods such as Score Distillation (SDS) and the improved version, Conditional Score Distillation (CSD). BSD offers improved inpainting quality in appearance and geometric aspects. Extensive experiments show that our method provides superior appearance fidelity and geometric consistency compared to existing approaches. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15551v1-abstract-full').style.display = 'none'; document.getElementById('2411.15551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15203">arXiv:2411.15203</a> <span> [<a href="https://arxiv.org/pdf/2411.15203">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multimodal large language model for wheat breeding: a new exploration of smart breeding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guofeng Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yong He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhenjiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+L">Lingzhen Ye</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hui Fang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yiqi Luo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xuping Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15203v1-abstract-short" style="display: inline;"> UAV remote sensing technology has become a key technology in crop breeding, which can achieve high-throughput and non-destructive collection of crop phenotyping data. However, the multidisciplinary nature of breeding has brought technical barriers and efficiency challenges to knowledge mining. Therefore, it is important to develop a smart breeding goal tool to mine cross-domain multimodal data. Ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15203v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15203v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15203v1-abstract-full" style="display: none;"> UAV remote sensing technology has become a key technology in crop breeding, which can achieve high-throughput and non-destructive collection of crop phenotyping data. However, the multidisciplinary nature of breeding has brought technical barriers and efficiency challenges to knowledge mining. Therefore, it is important to develop a smart breeding goal tool to mine cross-domain multimodal data. 
Based on different pre-trained open-source multimodal large language models (MLLMs) (e.g., Qwen-VL, InternVL, Deepseek-VL), this study used supervised fine-tuning (SFT), retrieval-augmented generation (RAG), and reinforcement learning from human feedback (RLHF) technologies to inject cross-domain knowledge into MLLMs, thereby constructing multiple multimodal large language models for wheat breeding (WBLMs). The WBLMs were evaluated using the evaluation benchmark newly created in this study. The results showed that the WBLM constructed using SFT, RAG, and RLHF technologies with InternVL2-8B has leading performance. Subsequent experiments were then conducted using this WBLM. Ablation experiments indicated that the combination of SFT, RAG, and RLHF technologies can improve the overall generation performance, enhance the generated quality, balance the timeliness and adaptability of the generated answers, and reduce hallucinations and biases. The WBLM performed best in wheat yield prediction using cross-domain data (remote sensing, phenotyping, weather, germplasm) simultaneously, with R2 and RMSE of 0.821 and 489.254 kg/ha, respectively. Furthermore, the WBLM can generate professional decision support answers for phenotyping estimation, environmental stress assessment, target germplasm screening, cultivation technique recommendation, and seed price query tasks.
Submitted: 19 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15131">arXiv:2411.15131</a> <span> [<a href="https://arxiv.org/pdf/2411.15131">pdf</a>, <a href="https://arxiv.org/format/2411.15131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> WildLMa: Long Horizon Loco-Manipulation in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+R">Ri-Zhao Qiu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuchen Song</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xuanbin Peng</a>, <a href="/search/cs?searchtype=author&query=Suryadevara%2C+S+A">Sai Aneesh Suryadevara</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghuan Liu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+M">Mazeyu Ji</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Chengzhe Jia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+R">Ruihan Yang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+X">Xueyan Zou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15131v1-abstract-short" style="display: inline;"> `In-the-wild' mobile manipulation aims to deploy robots in diverse real-world environments, which requires the robot to (1) have skills that generalize across object configurations; (2) be capable of long-horizon task execution in diverse environments; and (3) perform complex manipulation beyond pick-and-place. Quadruped robots with manipulators hold promise for extending the workspace and enablin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15131v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15131v1-abstract-full" style="display: none;"> `In-the-wild' mobile manipulation aims to deploy robots in diverse real-world environments, which requires the robot to (1) have skills that generalize across object configurations; (2) be capable of long-horizon task execution in diverse environments; and (3) perform complex manipulation beyond pick-and-place. Quadruped robots with manipulators hold promise for extending the workspace and enabling robust locomotion, but existing results do not investigate such a capability. This paper proposes WildLMa with three components to address these issues: (1) adaptation of learned low-level controller for VR-enabled whole-body teleoperation and traversability; (2) WildLMa-Skill -- a library of generalizable visuomotor skills acquired via imitation learning or heuristics and (3) WildLMa-Planner -- an interface of learned skills that allow LLM planners to coordinate skills for long-horizon tasks. 
We demonstrate the importance of high-quality training data by achieving a higher grasping success rate than existing RL baselines using only tens of demonstrations. WildLMa exploits CLIP for language-conditioned imitation learning that empirically generalizes to objects unseen in training demonstrations. Besides extensive quantitative evaluation, we qualitatively demonstrate practical robot applications, such as cleaning up trash in university hallways or outdoor terrains, operating articulated objects, and rearranging items on a bookshelf.
Submitted: 22 November, 2024; originally announced November 2024.
Comments: Website: https://wildlma.github.io/
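The abstract describes WildLMa-Planner only as an interface through which an LLM composes skills for long-horizon tasks. The snippet below sketches the simplest form such an interface can take: named skills registered in a library and a plan expressed as an ordered list of skill calls. The skill names, signatures, and the fixed example plan are illustrative assumptions, not the released API.

```python
# Hypothetical skill-library interface for an LLM planner (illustrative; not WildLMa's API).
from typing import Callable, Dict, List, Tuple

SKILLS: Dict[str, Callable[[str], None]] = {}

def skill(name: str):
    """Register a named visuomotor skill so a planner can refer to it by string."""
    def register(fn):
        SKILLS[name] = fn
        return fn
    return register

@skill("navigate_to")
def navigate_to(target: str) -> None:
    print(f"[locomotion] walking to {target}")

@skill("grasp")
def grasp(obj: str) -> None:
    print(f"[manipulation] grasping {obj}")

@skill("place_in")
def place_in(container: str) -> None:
    print(f"[manipulation] placing held object into {container}")

def execute(plan: List[Tuple[str, str]]) -> None:
    """Run an ordered list of (skill, argument) pairs, e.g. as emitted by an LLM planner."""
    for name, arg in plan:
        SKILLS[name](arg)

# A plan an LLM might emit for "throw away the cup in the hallway":
execute([("navigate_to", "hallway"), ("grasp", "paper cup"),
         ("navigate_to", "trash bin"), ("place_in", "trash bin")])
```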
arXiv:2411.15066 [pdf, other] (https://arxiv.org/abs/2411.15066)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: SPAC-Net: Rethinking Point Cloud Completion with Structural Prior
Authors: Zizhao Wu, Jian Shi, Xuan Deng, Cheng Zhang, Genfu Yang, Ming Zeng, Yunhai Wang
Abstract: Point cloud completion aims to infer a complete shape from its partial observation. Many approaches utilize a pure encoder-decoder paradigm in which the complete shape can be directly predicted from shape priors learned from partial scans; however, these methods inevitably suffer from a loss of details due to feature abstraction issues. In this paper, we propose a novel framework, termed SPAC-Net, that rethinks the completion task under the guidance of a new structural prior, which we call the interface. Specifically, our method first introduces a Marginal Detector (MAD) module to localize the interface, defined as the intersection between the known observation and the missing parts. Based on the interface, our method predicts the coarse shape by learning the displacement with which points on the interface move to their corresponding positions in the missing parts. Furthermore, we devise an additional Structure Supplement (SSP) module before the upsampling stage to enhance the structural details of the coarse shape, enabling the upsampling module to focus more on the upsampling task. Extensive experiments have been conducted on several challenging benchmarks, and the results demonstrate that our method outperforms existing state-of-the-art approaches.
Submitted: 22 November, 2024; originally announced November 2024.

arXiv:2411.12360 [pdf, ps, other] (https://arxiv.org/abs/2411.12360)
Subjects: cs.CR (Cryptography and Security); cs.DS (Data Structures and Algorithms)
Title: An Affine Equivalence Algorithm for S-boxes based on Matrix Invariants
Authors: Xincheng Hu, Xiao Zeng, Zhaoqiang Liu, Guowu Yang
Abstract: We investigate the affine equivalence (AE) problem of S-boxes. Given two S-boxes denoted as $S_1$ and $S_2$, we aim to seek two invertible AE transformations $A, B$ such that $S_1 \circ A = B \circ S_2$ holds. Due to important applications in the analysis and design of block ciphers, the investigation of AE algorithms has gained growing significance.
In this paper, we first propose a zeroization operation on S-boxes, by which the AE problem can be transformed into $2^n$ linear equivalence problems. Second, we propose the standard orthogonal spatial matrix (SOSM), whose rank is invariant under AE transformations. Finally, based on the zeroization operation and the SOSM method, we propose a depth-first search (DFS) method for determining AE of S-boxes, named the AE_SOSM_DFS algorithm. Using this matrix invariant, we optimize the time complexity of the algorithm to approximately $\frac{1}{2^n}$ of the complexity without SOSM. Specifically, the complexity of our algorithm is $O(2^{3n})$. In addition, we also conducted experiments with non-invertible S-boxes, and the performance is similar to that of invertible S-boxes. Moreover, our proposed algorithm can effectively handle S-boxes with low algebraic degree or certain popular S-boxes such as AES and ARIA_s2, which are difficult to handle with the algorithm proposed by Dinur (2018). Using our algorithm, it takes only 5.5 seconds to find out that the seven popular S-boxes, namely AES, ARIA_s2, Camellia, Chiasmus, DBlock, SEED_S0, and SMS4, are affine equivalent, and the AE transformations of these S-boxes are provided.
Submitted: 19 November, 2024; originally announced November 2024.
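The relation being searched for, $S_1 \circ A = B \circ S_2$, is easy to check once candidate affine maps are known. The sketch below verifies it exhaustively for a toy 3-bit example, with affine maps represented as a bit matrix plus a constant over GF(2). This is only the verification step, not the AE_SOSM_DFS search itself, and the example S-boxes are made up for illustration.

```python
# Verify S1(A(x)) == B(S2(x)) for all x, with A and B affine over GF(2)^n.
# Toy 3-bit example: checks a candidate equivalence; it is not the search algorithm.
def affine_apply(matrix, const, x, n):
    """y = M*x xor c, with x, y as n-bit integers and M given as rows of n-bit integers."""
    y = const
    for i in range(n):
        # Parity of (row i AND x) gives output bit i of the linear part.
        if bin(matrix[i] & x).count("1") % 2:
            y ^= 1 << i
    return y

def is_affine_equivalent(s1, s2, A, B, n):
    MA, cA = A
    MB, cB = B
    return all(s1[affine_apply(MA, cA, x, n)] == affine_apply(MB, cB, s2[x], n)
               for x in range(1 << n))

n = 3
identity = ([0b001, 0b010, 0b100], 0)        # A: M = I, c = 0
s2 = [0, 3, 5, 6, 7, 4, 2, 1]                # an arbitrary 3-bit S-box
xor_const = ([0b001, 0b010, 0b100], 0b101)   # B: x -> x xor 0b101
s1 = [s2[x] ^ 0b101 for x in range(8)]       # construct S1 = B o S2 with A = identity
print(is_affine_equivalent(s1, s2, identity, xor_const, n))  # True
```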