Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,021 results for author: <span class="mathjax">Yang, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yang%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+W&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09311">arXiv:2502.09311</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09311">pdf</a>, <a href="https://arxiv.org/format/2502.09311">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating the Impact of Prominent Position Shift in Drone-based RGBT Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Q">Qian Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Fang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+G">Gui-Song Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09311v1-abstract-short" style="display: inline;"> Drone-based RGBT object detection plays a crucial role in many around-the-clock applications. However, real-world drone-viewed RGBT data suffers from the prominent position shift problem, i.e., the position of a tiny object differs greatly in different modalities. 
For instance, a slight deviation of a tiny object in the thermal modality will induce it to drift from the main body of itself in the R&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09311v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09311v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09311v1-abstract-full" style="display: none;"> Drone-based RGBT object detection plays a crucial role in many around-the-clock applications. However, real-world drone-viewed RGBT data suffers from the prominent position shift problem, i.e., the position of a tiny object differs greatly in different modalities. For instance, a slight deviation of a tiny object in the thermal modality will induce it to drift from the main body of itself in the RGB modality. Considering RGBT data are usually labeled on one modality (reference), this will cause the unlabeled modality (sensed) to lack accurate supervision signals and prevent the detector from learning a good representation. Moreover, the mismatch of the corresponding feature point between the modalities will make the fused features confusing for the detection head. In this paper, we propose to cast the cross-modality box shift issue as the label noise problem and address it on the fly via a novel Mean Teacher-based Cross-modality Box Correction head ensemble (CBC). In this way, the network can learn more informative representations for both modalities. Furthermore, to alleviate the feature map mismatch problem in RGBT fusion, we devise a Shifted Window-Based Cascaded Alignment (SWCA) module. SWCA mines long-range dependencies between the spatially unaligned features inside shifted windows and cascaded aligns the sensed features with the reference ones. Extensive experiments on two drone-based RGBT object detection datasets demonstrate that the correction results are both visually and quantitatively favorable, thereby improving the detection performance. In particular, our CBC module boosts the precision of the sensed modality ground truth by 25.52 aSim points. Overall, the proposed detector achieves an mAP_50 of 43.55 points on RGBTDronePerson and surpasses a state-of-the-art method by 8.6 mAP50 on a shift subset of DroneVehicle dataset. The code and data will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09311v1-abstract-full').style.display = 'none'; document.getElementById('2502.09311v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
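   The CBC head ensemble above is "Mean Teacher-based": the defining mechanism of a Mean Teacher setup is a teacher whose weights are an exponential moving average (EMA) of the student's, so the teacher's smoother predictions can correct noisy targets. A minimal sketch of that generic update follows; the dictionary layout, parameter names, and momentum value are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

def ema_update(teacher: dict, student: dict, momentum: float = 0.999) -> None:
    """Generic Mean Teacher update: teacher <- m * teacher + (1 - m) * student.

    The teacher's smoothed predictions can then supervise noisy targets,
    which is the general mechanism behind Mean Teacher-style label
    correction (the CBC specifics in arXiv:2502.09311 are richer).
    """
    for name, w_student in student.items():
        teacher[name] = momentum * teacher[name] + (1.0 - momentum) * w_student

# Toy usage with random "weights".
rng = np.random.default_rng(0)
student = {"conv1": rng.normal(size=(3, 3)), "head": rng.normal(size=(4,))}
teacher = {k: v.copy() for k, v in student.items()}
for _ in range(10):                      # pretend training steps
    student["head"] += 0.01              # student weights drift
    ema_update(teacher, student)
print(teacher["head"] - student["head"]) # teacher lags the student smoothly
```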
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08093">arXiv:2502.08093</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08093">pdf</a>, <a href="https://arxiv.org/format/2502.08093">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Ground-Optimized 4D Radar-Inertial Odometry via Continuous Velocity Integration using Gaussian Process </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wooseong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jang%2C+H">Hyesu Jang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+A">Ayoung Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08093v1-abstract-short" style="display: inline;"> Radar ensures robust sensing capabilities in adverse weather conditions, yet challenges remain due to its high inherent noise level. Existing radar odometry has overcome these challenges with strategies such as filtering spurious points, exploiting Doppler velocity, or integrating with inertial measurements. This paper presents two novel improvements beyond the existing radar-inertial odometry: gr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08093v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08093v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08093v1-abstract-full" style="display: none;"> Radar ensures robust sensing capabilities in adverse weather conditions, yet challenges remain due to its high inherent noise level. Existing radar odometry has overcome these challenges with strategies such as filtering spurious points, exploiting Doppler velocity, or integrating with inertial measurements. This paper presents two novel improvements beyond the existing radar-inertial odometry: ground-optimized noise filtering and continuous velocity preintegration. Despite the widespread use of ground planes in LiDAR odometry, imprecise ground point distributions of radar measurements cause naive plane fitting to fail. Unlike plane fitting in LiDAR, we introduce a zone-based uncertainty-aware ground modeling specifically designed for radar. Secondly, we note that radar velocity measurements can be better combined with IMU for a more accurate preintegration in radar-inertial odometry. Existing methods often ignore temporal discrepancies between radar and IMU by simplifying the complexities of asynchronous data streams with discretized propagation models. Tackling this issue, we leverage GP and formulate a continuous preintegration method for tightly integrating 3-DOF linear velocity with IMU, facilitating full 6-DOF motion directly from the raw measurements. Our approach demonstrates remarkable performance (less than 1% vertical drift) in public datasets with meticulous conditions, illustrating substantial improvement in elevation accuracy. 
The code will be released as open source for the community: https://github.com/wooseongY/Go-RIO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08093v1-abstract-full').style.display = 'none'; document.getElementById('2502.08093v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07814">arXiv:2502.07814</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07814">pdf</a>, <a href="https://arxiv.org/format/2502.07814">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Satellite Observations Guided Diffusion Model for Accurate Meteorological States at Arbitrary Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tu%2C+S">Siwei Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+B">Ben Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Weidong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+F">Fenghua Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zili Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Hang Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07814v1-abstract-short" style="display: inline;"> Accurate acquisition of surface meteorological conditions at arbitrary locations holds significant importance for weather forecasting and climate simulation. Due to the fact that meteorological states derived from satellite observations are often provided in the form of low-resolution grid fields, the direct application of spatial interpolation to obtain meteorological states for specific location&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07814v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07814v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07814v1-abstract-full" style="display: none;"> Accurate acquisition of surface meteorological conditions at arbitrary locations holds significant importance for weather forecasting and climate simulation. 
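   For context on what the "continuous velocity preintegration" above replaces: the discretized baseline the abstract criticizes simply rotates each body-frame radar velocity sample into the world frame using gyro-integrated orientation and sums it, assuming radar and IMU tick at the same instants. A deliberately naive sketch of that baseline follows (NumPy; the constant-rate toy data and variable names are illustrative, not Go-RIO's code).

```python
import numpy as np

def so3_exp(w: np.ndarray) -> np.ndarray:
    """Rodrigues formula: map a rotation vector to a rotation matrix."""
    theta = np.linalg.norm(w)
    if theta < 1e-12:
        return np.eye(3)
    k = w / theta
    K = np.array([[0, -k[2], k[1]], [k[2], 0, -k[0]], [-k[1], k[0], 0]])
    return np.eye(3) + np.sin(theta) * K + (1 - np.cos(theta)) * (K @ K)

def propagate(gyro, radar_vel, dt):
    """Discretized propagation: orientation from gyro, position from
    body-frame radar velocity rotated into the world frame.

    This zero-order-hold scheme ignores radar/IMU timestamp mismatch;
    the paper's Gaussian-process formulation integrates continuously
    across the asynchronous streams instead.
    """
    R = np.eye(3)
    p = np.zeros(3)
    for w, v in zip(gyro, radar_vel):
        R = R @ so3_exp(w * dt)   # integrate angular rate
        p = p + R @ v * dt        # integrate rotated linear velocity
    return R, p

# Toy usage: constant yaw rate + constant forward velocity -> an arc.
n, dt = 100, 0.01
gyro = np.tile([0.0, 0.0, 0.5], (n, 1))   # rad/s about z, body frame
vel = np.tile([1.0, 0.0, 0.0], (n, 1))    # m/s forward, body frame
R, p = propagate(gyro, vel, dt)
print(p)  # end position after 1 s along the arc
```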
3. arXiv:2502.07814 [pdf, other] cs.LG, cs.AI, physics.ao-ph
   Satellite Observations Guided Diffusion Model for Accurate Meteorological States at Arbitrary Resolution
   Authors: Siwei Tu, Ben Fei, Weidong Yang, Fenghua Ling, Hao Chen, Zili Liu, Kun Chen, Hang Fan, Wanli Ouyang, Lei Bai
   Abstract: Accurate acquisition of surface meteorological conditions at arbitrary locations holds significant importance for weather forecasting and climate simulation. Due to the fact that meteorological states derived from satellite observations are often provided in the form of low-resolution grid fields, the direct application of spatial interpolation to obtain meteorological states for specific locations often results in significant discrepancies when compared to actual observations. Existing downscaling methods for acquiring meteorological state information at higher resolutions commonly overlook the correlation with satellite observations. To bridge the gap, we propose Satellite-observations Guided Diffusion Model (SGD), a conditional diffusion model pre-trained on ERA5 reanalysis data with satellite observations (GridSat) as conditions, which is employed for sampling downscaled meteorological states through a zero-shot guided sampling strategy and patch-based methods. During the training process, we propose to fuse the information from GridSat satellite observations into ERA5 maps via the attention mechanism, enabling SGD to generate atmospheric states that align more accurately with actual conditions. In the sampling, we employed optimizable convolutional kernels to simulate the upscale process, thereby generating high-resolution ERA5 maps using low-resolution ERA5 maps as well as observations from weather stations as guidance. Moreover, our devised patch-based method promotes SGD to generate meteorological states at arbitrary resolutions. Experiments demonstrate SGD fulfills accurate meteorological states downscaling to 6.25km.
   Submitted 8 February, 2025; originally announced February 2025.
4. arXiv:2502.07703 [pdf, other] cs.RO
   GaRLIO: Gravity enhanced Radar-LiDAR-Inertial Odometry
   Authors: Chiyun Noh, Wooseong Yang, Minwoo Jung, Sangwoo Jung, Ayoung Kim
   Abstract: Recently, gravity has been highlighted as a crucial constraint for state estimation to alleviate potential vertical drift. Existing online gravity estimation methods rely on pose estimation combined with IMU measurements, which is considered best practice when direct velocity measurements are unavailable. However, with radar sensors providing direct velocity data (a measurement not yet utilized for gravity estimation), we found a significant opportunity to improve gravity estimation accuracy substantially. GaRLIO, the proposed gravity-enhanced Radar-LiDAR-Inertial Odometry, can robustly predict gravity to reduce vertical drift while simultaneously enhancing state estimation performance using pointwise velocity measurements. Furthermore, GaRLIO ensures robustness in dynamic environments by utilizing radar to remove dynamic objects from LiDAR point clouds. Our method is validated through experiments in various environments prone to vertical drift, demonstrating superior performance compared to traditional LiDAR-Inertial Odometry methods. We make our source code publicly available to encourage further research and development: https://github.com/ChiyunNoh/GaRLIO
   Submitted 11 February, 2025; originally announced February 2025.
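   One way to see why direct velocity measurements constrain gravity, as the abstract argues: an accelerometer measures specific force f = a - g in the body frame, so once velocity differences supply the true acceleration a, gravity falls out algebraically as g = dv/dt - R f. A hedged toy of just that algebraic core follows (NumPy; synthetic at-rest data, not the GaRLIO estimator).

```python
import numpy as np

def estimate_gravity(v_world, f_body, R_wb, dt):
    """Estimate the world-frame gravity vector from velocity differences.

    Model: a_world = R_wb @ f_body + g  =>  g = dv/dt - R_wb @ f_body.
    Averaging over many samples suppresses noise. This is only the
    algebraic reason direct velocity helps; GaRLIO couples it with a
    full radar-LiDAR-inertial pipeline.
    """
    g_samples = []
    for k in range(len(v_world) - 1):
        a_world = (v_world[k + 1] - v_world[k]) / dt
        g_samples.append(a_world - R_wb[k] @ f_body[k])
    return np.mean(g_samples, axis=0)

# Synthetic check: stationary platform with identity attitude.
n, dt = 50, 0.01
g_true = np.array([0.0, 0.0, -9.81])
v = np.zeros((n, 3))                 # no motion -> dv/dt = 0
f = np.tile(-g_true, (n, 1))         # accelerometer reads -g at rest
R = np.tile(np.eye(3), (n, 1, 1))
print(estimate_gravity(v, f, R, dt)) # ~ [0, 0, -9.81]
```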
5. arXiv:2502.06392 [pdf, other] cs.CV, cs.GR
   TANGLED: Generating 3D Hair Strands from Images with Arbitrary Styles and Viewpoints
   Authors: Pengyu Long, Zijun Zhao, Min Ouyang, Qingcheng Zhao, Qixuan Zhang, Wei Yang, Lan Xu, Jingyi Yu
   Abstract: Hairstyles are intricate and culturally significant with various geometries, textures, and structures. Existing text or image-guided generation methods fail to handle the richness and complexity of diverse styles. We present TANGLED, a novel approach for 3D hair strand generation that accommodates diverse image inputs across styles, viewpoints, and quantities of input views. TANGLED employs a three-step pipeline. First, our MultiHair Dataset provides 457 diverse hairstyles annotated with 74 attributes, emphasizing complex and culturally significant styles to improve model generalization. Second, we propose a diffusion framework conditioned on multi-view linearts that can capture topological cues (e.g., strand density and parting lines) while filtering out noise. By leveraging a latent diffusion model with cross-attention on lineart features, our method achieves flexible and robust 3D hair generation across diverse input conditions. Third, a parametric post-processing module enforces braid-specific constraints to maintain coherence in complex structures. This framework not only advances hairstyle realism and diversity but also enables culturally inclusive digital avatars and novel applications like sketch-based 3D strand editing for animation and augmented reality.
   Submitted 10 February, 2025; originally announced February 2025.
   Comments: Project Page: https://sites.google.com/view/tangled1

6. arXiv:2502.05902 [pdf, other] cs.CV
   Fast Omni-Directional Image Super-Resolution: Adapting the Implicit Image Function with Pixel and Semantic-Wise Spherical Geometric Priors
   Authors: Xuelin Shen, Yitong Wang, Silin Zheng, Kang Xiao, Wenhan Yang, Xu Wang
   Abstract: In the context of Omni-Directional Image (ODI) Super-Resolution (SR), the unique challenge arises from the non-uniform oversampling characteristics caused by EquiRectangular Projection (ERP). Considerable efforts in designing complex spherical convolutions or polyhedron reprojection offer significant performance improvements but at the expense of cumbersome processing procedures and slower inference speeds. Under these circumstances, this paper proposes a new ODI-SR model characterized by its capacity to perform Fast and Arbitrary-scale ODI-SR processes, denoted as FAOR. The key innovation lies in adapting the implicit image function from the planar image domain to the ERP image domain by incorporating spherical geometric priors at both the latent representation and image reconstruction stages, in a low-overhead manner. Specifically, at the latent representation stage, we adopt a pair of pixel-wise and semantic-wise sphere-to-planar distortion maps to perform affine transformations on the latent representation, thereby incorporating it with spherical properties. Moreover, during the image reconstruction stage, we introduce a geodesic-based resampling strategy, aligning the implicit image function with spherical geometrics without introducing additional parameters. As a result, the proposed FAOR outperforms the state-of-the-art ODI-SR models with a much faster inference speed. Extensive experimental results and ablation studies have demonstrated the effectiveness of our design.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: 9 pages, 4 figures, AAAI 2025
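   The ERP distortion that FAOR's spherical priors compensate for is easy to state: each row of an equirectangular image maps to a latitude circle, so per-pixel solid angle shrinks with cos(latitude) toward the poles. The sketch below computes that textbook pixel-wise area factor (NumPy); it illustrates the kind of geometric prior involved, not FAOR's learned distortion maps.

```python
import numpy as np

def erp_grid(h: int, w: int):
    """Latitude/longitude (radians) of each pixel center in an ERP image."""
    lat = (0.5 - (np.arange(h) + 0.5) / h) * np.pi        # +pi/2 .. -pi/2
    lon = ((np.arange(w) + 0.5) / w - 0.5) * 2.0 * np.pi  # -pi .. +pi
    return np.meshgrid(lat, lon, indexing="ij")

def erp_area_weights(h: int, w: int) -> np.ndarray:
    """Per-pixel relative solid angle, proportional to cos(latitude).

    Rows near the equator cover more of the sphere than rows near the
    poles; this is the non-uniform oversampling that planar SR models
    ignore and spherical priors reintroduce.
    """
    lat, _ = erp_grid(h, w)
    return np.cos(lat)

w8 = erp_area_weights(8, 16)
print(w8[4, 0], w8[0, 0])  # equator-row weight >> near-pole-row weight
```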
7. arXiv:2502.05887 [pdf, other] cs.CL, cs.AI
   MTPChat: A Multimodal Time-Aware Persona Dataset for Conversational Agents
   Authors: Wanqi Yang, Yanda Li, Meng Fang, Ling Chen
   Abstract: Understanding temporal dynamics is critical for conversational agents, enabling effective content analysis and informed decision-making. However, time-aware datasets, particularly for persona-grounded conversations, are still limited, which narrows their scope and diminishes their complexity. To address this gap, we introduce MTPChat, a multimodal, time-aware persona dialogue dataset that integrates linguistic, visual, and temporal elements within dialogue and persona memory. Leveraging MTPChat, we propose two time-sensitive tasks: Temporal Next Response Prediction (TNRP) and Temporal Grounding Memory Prediction (TGMP), both designed to assess a model's ability to understand implicit temporal cues and dynamic interactions. Additionally, we present an innovative framework featuring an adaptive temporal module to effectively integrate multimodal streams and capture temporal dependencies. Experimental results validate the challenges posed by MTPChat and demonstrate the effectiveness of our framework in multimodal time-sensitive scenarios.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: NAACL 2025 Findings

8. arXiv:2502.04592 [pdf, other] cs.LG, cs.AI, cs.CE
   CAMEF: Causal-Augmented Multi-Modality Event-Driven Financial Forecasting by Integrating Time Series Patterns and Salient Macroeconomic Announcements
   Authors: Yang Zhang, Wenbo Yang, Jun Wang, Qiang Ma, Jie Xiong
   Abstract: Accurately forecasting the impact of macroeconomic events is critical for investors and policymakers. Salient events like monetary policy decisions and employment reports often trigger market movements by shaping expectations of economic growth and risk, thereby establishing causal relationships between events and market behavior. Existing forecasting methods typically focus either on textual analysis or time-series modeling, but fail to capture the multi-modal nature of financial markets and the causal relationship between events and price movements. To address these gaps, we propose CAMEF (Causal-Augmented Multi-Modality Event-Driven Financial Forecasting), a multi-modality framework that effectively integrates textual and time-series data with a causal learning mechanism and an LLM-based counterfactual event augmentation technique for causal-enhanced financial forecasting. Our contributions include: (1) a multi-modal framework that captures causal relationships between policy texts and historical price data; (2) a new financial dataset with six types of macroeconomic releases from 2008 to April 2024, and high-frequency real trading data for five key U.S. financial assets; and (3) an LLM-based counterfactual event augmentation strategy. We compare CAMEF to state-of-the-art transformer-based time-series and multi-modal baselines, and perform ablation studies to validate the effectiveness of the causal learning mechanism and event types.
   Submitted 6 February, 2025; originally announced February 2025.

9. arXiv:2502.03682 [pdf, other] cs.CR, cs.HC
   Towards Scalable Defenses against Intimate Partner Infiltrations
   Authors: Weisi Yang, Shinan Liu, Feng Xiao, Nick Feamster, Stephen Xia
   Abstract: Intimate Partner Infiltration (IPI)--a type of Intimate Partner Violence (IPV) that typically requires physical access to a victim's device--is a pervasive concern in the United States, often manifesting through digital surveillance, control, and monitoring. Unlike conventional cyberattacks, IPI perpetrators leverage close proximity and personal knowledge to circumvent standard protections, underscoring the need for targeted interventions. While security clinics and other human-centered approaches effectively tailor solutions for survivors, their scalability remains constrained by resource limitations and the need for specialized counseling. In this paper, we present AID, an Automated IPI Detection system that continuously monitors for unauthorized access and suspicious behaviors on smartphones. AID employs a two-stage architecture to process multimodal signals stealthily and preserve user privacy. A brief calibration phase upon installation enables AID to adapt to each user's behavioral patterns, achieving high accuracy with minimal false alarms. Our 27-participant user study demonstrates that AID achieves highly accurate detection of non-owner access and fine-grained IPI-related activities, attaining an end-to-end top-3 F1 score of 0.981 with a false positive rate of 4%. These findings suggest that AID can serve as a forensic tool within security clinics, scaling their ability to identify IPI tactics and deliver personalized, far-reaching support to survivors.
   Submitted 5 February, 2025; originally announced February 2025.

10. arXiv:2502.01946 [pdf, other] cs.RO, cs.CV
   HeRCULES: Heterogeneous Radar Dataset in Complex Urban Environment for Multi-session Radar SLAM
   Authors: Hanjun Kim, Minwoo Jung, Chiyun Noh, Sangwoo Jung, Hyunho Song, Wooseong Yang, Hyesu Jang, Ayoung Kim
   Abstract: Recently, radars have been widely featured in robotics for their robustness in challenging weather conditions. Two commonly used radar types are spinning radars and phased-array radars, each offering distinct sensor characteristics. Existing datasets typically feature only a single type of radar, leading to the development of algorithms limited to that specific kind. In this work, we highlight that combining different radar types offers complementary advantages, which can be leveraged through a heterogeneous radar dataset. Moreover, this new dataset fosters research in multi-session and multi-robot scenarios where robots are equipped with different types of radars. In this context, we introduce the HeRCULES dataset, a comprehensive, multi-modal dataset with heterogeneous radars, FMCW LiDAR, IMU, GPS, and cameras. This is the first dataset to integrate 4D radar and spinning radar alongside FMCW LiDAR, offering unparalleled localization, mapping, and place recognition capabilities. The dataset covers diverse weather and lighting conditions and a range of urban traffic scenarios, enabling a comprehensive analysis across various environments. The sequence paths with multiple revisits and ground truth pose for each sensor enhance its suitability for place recognition research. We expect the HeRCULES dataset to facilitate odometry, mapping, place recognition, and sensor fusion research. The dataset and development tools are available at https://sites.google.com/view/herculesdataset.
   Submitted 3 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2025 IEEE International Conference on Robotics and Automation (ICRA 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01458">arXiv:2502.01458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01458">pdf</a>, <a href="https://arxiv.org/format/2502.01458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Understanding the Capabilities and Limitations of Weak-to-Strong Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wenkai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziqiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01458v1-abstract-short" style="display: inline;"> Weak-to-strong generalization, where weakly supervised strong models outperform their weaker teachers, offers a promising approach to aligning superhuman models with human values. To deepen the understanding of this approach, we provide theoretical insights into its capabilities and limitations. First, in the classification setting, we establish upper and lower generalization error bounds for the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01458v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01458v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01458v1-abstract-full" style="display: none;"> Weak-to-strong generalization, where weakly supervised strong models outperform their weaker teachers, offers a promising approach to aligning superhuman models with human values. To deepen the understanding of this approach, we provide theoretical insights into its capabilities and limitations. First, in the classification setting, we establish upper and lower generalization error bounds for the strong model, identifying the primary limitations as stemming from the weak model&#39;s generalization error and the optimization objective itself. Additionally, we derive lower and upper bounds on the calibration error of the strong model. These theoretical bounds reveal two critical insights: (1) the weak model should demonstrate strong generalization performance and maintain well-calibrated predictions, and (2) the strong model&#39;s training process must strike a careful balance, as excessive optimization could undermine its generalization capability by over-relying on the weak supervision signals. Finally, in the regression setting, we extend the work of Charikar et al. 

arXiv:2502.01458 (https://arxiv.org/abs/2502.01458) [pdf, other] cs.LG (Machine Learning); stat.ML (Machine Learning)
Title: Understanding the Capabilities and Limitations of Weak-to-Strong Generalization
Authors: Wei Yao, Wenkai Yang, Ziqiao Wang, Yankai Lin, Yong Liu
Abstract: Weak-to-strong generalization, where weakly supervised strong models outperform their weaker teachers, offers a promising approach to aligning superhuman models with human values. To deepen the understanding of this approach, we provide theoretical insights into its capabilities and limitations. First, in the classification setting, we establish upper and lower generalization error bounds for the strong model, identifying the primary limitations as stemming from the weak model's generalization error and the optimization objective itself. Additionally, we derive lower and upper bounds on the calibration error of the strong model. These theoretical bounds reveal two critical insights: (1) the weak model should demonstrate strong generalization performance and maintain well-calibrated predictions, and (2) the strong model's training process must strike a careful balance, as excessive optimization could undermine its generalization capability by over-relying on the weak supervision signals. Finally, in the regression setting, we extend the work of Charikar et al. (2024) to a loss function based on Kullback-Leibler (KL) divergence, offering guarantees that the strong student can outperform its weak teacher by at least the magnitude of their disagreement. We conduct extensive experiments to validate our theory.
Submitted 3 February, 2025; originally announced February 2025.
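As an aside on the setup this abstract studies: the weak-to-strong pipeline is easy to reproduce in miniature. Below is a minimal sketch (not the authors' code; the models, data, and sizes are arbitrary assumptions): a weak teacher is fit on a small labeled set, pseudo-labels a larger pool, and a stronger student is trained on those pseudo-labels, so the student's gain over its teacher can be measured directly.

```python
# Minimal weak-to-strong illustration; assumes scikit-learn is installed.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=6000, n_features=20, random_state=0)
X_weak, X_rest, y_weak, y_rest = train_test_split(X, y, train_size=200, random_state=0)
X_pool, X_test, y_pool, y_test = train_test_split(X_rest, y_rest, test_size=0.3, random_state=0)

weak = LogisticRegression(max_iter=1000).fit(X_weak, y_weak)   # weak teacher
pseudo_labels = weak.predict(X_pool)                           # weak supervision only
strong = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=300,
                       random_state=0).fit(X_pool, pseudo_labels)  # strong student

print("weak teacher accuracy:  ", accuracy_score(y_test, weak.predict(X_test)))
print("strong student accuracy:", accuracy_score(y_test, strong.predict(X_test)))
# Weak-to-strong generalization is the case where the student beats its teacher;
# the abstract's bounds describe when and by how much this can happen.
```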

arXiv:2502.00261 (https://arxiv.org/abs/2502.00261) [pdf, other] cs.DC (Distributed, Parallel, and Cluster Computing)
Title: Alternative Mixed Integer Linear Programming Optimization for Joint Job Scheduling and Data Allocation in Grid Computing
Authors: Shengyu Feng, Jaehyung Kim, Yiming Yang, Joseph Boudreau, Tasnuva Chowdhury, Adolfy Hoisie, Raees Khan, Ozgur O. Kilic, Scott Klasky, Tatiana Korchuganova, Paul Nilsson, Verena Ingrid Martinez Outschoorn, David K. Park, Norbert Podhorszki, Yihui Ren, Frederic Suter, Sairam Sri Vatsavai, Wei Yang, Shinjae Yoo, Tadashi Maeno, Alexei Klimentov
Abstract: This paper presents a novel approach to the joint optimization of job scheduling and data allocation in grid computing environments. We formulate this joint optimization problem as a mixed integer quadratically constrained program. To tackle the nonlinearity in the constraint, we alternately fix a subset of decision variables and optimize the remaining ones via Mixed Integer Linear Programming (MILP). We solve the MILP problem at each iteration via an off-the-shelf MILP solver. Our experimental results show that our method significantly outperforms existing heuristic methods, employing either independent optimization or joint optimization strategies. We have also verified the generalization ability of our method across grid environments of various sizes and its high robustness to the algorithm hyper-parameters.
Submitted 31 January, 2025; originally announced February 2025.
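The alternating fix-and-optimize scheme in this abstract can be illustrated on a toy model (an assumed formulation, not the paper's: each job reads one dataset, sites have job and storage capacities, and a unit cost is paid whenever a job runs at a site lacking its dataset). Fixing the data placement makes job scheduling a MILP, and vice versa. The sketch assumes the PuLP library with its bundled CBC solver.

```python
# Toy alternating-MILP sketch (assumed model, not the paper's); pip install pulp.
import pulp

jobs = {0: "dA", 1: "dA", 2: "dB", 3: "dB", 4: "dC"}  # job -> dataset it reads
sites, datasets = ["s0", "s1"], ["dA", "dB", "dC"]
cap = {"s0": 3, "s1": 2}      # max jobs per site
store = {"s0": 2, "s1": 1}    # max datasets stored per site

def solve_jobs(y):
    """Fix data placement y; job assignment x becomes a MILP."""
    prob = pulp.LpProblem("jobs", pulp.LpMinimize)
    x = {(j, s): pulp.LpVariable(f"x_{j}_{s}", cat="Binary") for j in jobs for s in sites}
    # Pay 1 whenever a job runs at a site that lacks its dataset (remote read).
    prob += pulp.lpSum(x[j, s] * (1 - y[jobs[j], s]) for j in jobs for s in sites)
    for j in jobs:
        prob += pulp.lpSum(x[j, s] for s in sites) == 1  # every job placed once
    for s in sites:
        prob += pulp.lpSum(x[j, s] for j in jobs) <= cap[s]
    prob.solve(pulp.PULP_CBC_CMD(msg=0))
    return {k: int(v.value()) for k, v in x.items()}

def solve_data(x):
    """Fix job assignment x; data placement y becomes a MILP."""
    prob = pulp.LpProblem("data", pulp.LpMinimize)
    y = {(d, s): pulp.LpVariable(f"y_{d}_{s}", cat="Binary") for d in datasets for s in sites}
    prob += pulp.lpSum(x[j, s] * (1 - y[jobs[j], s]) for j in jobs for s in sites)
    for s in sites:
        prob += pulp.lpSum(y[d, s] for d in datasets) <= store[s]
    for d in datasets:
        prob += pulp.lpSum(y[d, s] for s in sites) >= 1  # keep one replica of each
    prob.solve(pulp.PULP_CBC_CMD(msg=0))
    return {k: int(v.value()) for k, v in y.items()}

y = {(d, s): 1 for d in datasets for s in sites}  # initial guess: data everywhere
for _ in range(3):                                # alternate until assignments settle
    x = solve_jobs(y)
    y = solve_data(x)
print("job placement:", {j: s for (j, s), v in x.items() if v})
```

Because the remote-read cost is bilinear in x and y, each half-step is an exact MILP once the other block is frozen, which is the essence of the alternating strategy the abstract describes.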

arXiv:2501.18934 (https://arxiv.org/abs/2501.18934) [pdf, other] cs.CR (Cryptography and Security)
Title: Deep Learning Model Inversion Attacks and Defenses: A Comprehensive Survey
Authors: Wencheng Yang, Song Wang, Di Wu, Taotao Cai, Yanming Zhu, Shicheng Wei, Yiying Zhang, Xu Yang, Yan Li
Abstract: The rapid adoption of deep learning in sensitive domains has brought tremendous benefits. However, this widespread adoption has also given rise to serious vulnerabilities, particularly model inversion (MI) attacks, posing a significant threat to the privacy and integrity of personal data. The increasing prevalence of these attacks in applications such as biometrics, healthcare, and finance has created an urgent need to understand their mechanisms, impacts, and defense methods. This survey aims to fill the gap in the literature by providing a structured and in-depth review of MI attacks and defense strategies. Our contributions include a systematic taxonomy of MI attacks, extensive research on attack techniques and defense mechanisms, and a discussion about the challenges and future research directions in this evolving field. By exploring the technical and ethical implications of MI attacks, this survey aims to offer insights into the impact of AI-powered systems on privacy, security, and trust. In conjunction with this survey, we have developed a comprehensive repository to support research on MI attacks and defenses. The repository includes state-of-the-art research papers, datasets, evaluation metrics, and other resources to meet the needs of both novice and experienced researchers interested in MI attacks and defenses, as well as the broader field of AI security and privacy. The repository will be continuously maintained to ensure its relevance and utility. It is accessible at https://github.com/overgter/Deep-Learning-Model-Inversion-Attacks-and-Defenses.
Submitted 31 January, 2025; originally announced January 2025.
Comments: 2 figures, 56 pages

arXiv:2501.16605 (https://arxiv.org/abs/2501.16605) [pdf] cs.DB (Databases); cs.AI (Artificial Intelligence)
Title: Impact and influence of modern AI in metadata management
Authors: Wenli Yang, Rui Fu, Muhammad Bilal Amin, Byeong Kang
Abstract: Metadata management plays a critical role in data governance, resource discovery, and decision-making in the data-driven era. While traditional metadata approaches have primarily focused on organization, classification, and resource reuse, the integration of modern artificial intelligence (AI) technologies has significantly transformed these processes. This paper investigates both traditional and AI-driven metadata approaches by examining open-source solutions, commercial tools, and research initiatives. A comparative analysis of traditional and AI-driven metadata management methods is provided, highlighting existing challenges and their impact on next-generation datasets. The paper also presents an innovative AI-assisted metadata management framework designed to address these challenges. This framework leverages more advanced modern AI technologies to automate metadata generation, enhance governance, and improve the accessibility and usability of modern datasets. Finally, the paper outlines future directions for research and development, proposing opportunities to further advance metadata management in the context of AI-driven innovation and complex datasets.
Submitted 27 January, 2025; originally announced January 2025.

arXiv:2501.16398 (https://arxiv.org/abs/2501.16398) [pdf] cs.LG (Machine Learning); physics.atom-ph (Atomic Physics)
Title: Visualizing the Local Atomic Environment Features of Machine Learning Interatomic Potential
Authors: Xuqiang Shao, Yuqi Zhang, Di Zhang, Tianxiang Gao, Xinyuan Liu, Zhiran Gan, Fanshun Meng, Hao Li, Weijie Yang
Abstract: This paper addresses the challenges of creating efficient and high-quality datasets for machine learning potential functions. We present a novel approach, termed DV-LAE (Difference Vectors based on Local Atomic Environments), which utilizes the properties of atomic local environments and employs histogram statistics to generate difference vectors. This technique facilitates dataset screening and optimization, effectively minimizing redundancy while maintaining data diversity. We have validated the optimized datasets in high-temperature and high-pressure hydrogen systems as well as the α-Fe/H binary system, demonstrating a significant reduction in computational resource usage without compromising prediction accuracy. Additionally, our method has revealed new structures that emerge during simulations but were underrepresented in the initial training datasets. The redundancy in the datasets and the distribution of these new structures can be visually analyzed through the visualization of difference vectors. This approach enhances our understanding of the characteristics of these newly formed structures and their impact on physical processes.
Submitted 26 January, 2025; originally announced January 2025.
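The histogram-plus-difference-vector idea above lends itself to a compact illustration. The sketch below is illustrative only (the descriptor, cutoff, bin count, and threshold are assumptions, not the paper's DV-LAE settings): each structure is fingerprinted with a normalized interatomic-distance histogram, and near-duplicate structures are greedily dropped when their difference vector to every kept structure is small.

```python
# Histogram-based environment fingerprints for dataset screening; NumPy only.
import numpy as np

def env_histogram(positions, cutoff=5.0, bins=32):
    """Normalized histogram of interatomic distances below a cutoff: a crude
    fingerprint of the local atomic environments in one structure."""
    d = np.linalg.norm(positions[:, None, :] - positions[None, :, :], axis=-1)
    d = d[np.triu_indices_from(d, k=1)]              # unique atom pairs only
    h, _ = np.histogram(d[d < cutoff], bins=bins, range=(0.0, cutoff))
    return h / max(h.sum(), 1)

def screen(structures, tol=0.08):
    """Greedy screening: keep a structure only if its difference vector to all
    kept structures is large enough (cuts redundancy, keeps diversity)."""
    kept, fingerprints = [], []
    for s in structures:
        f = env_histogram(s)
        if all(np.abs(f - g).sum() > tol for g in fingerprints):
            kept.append(s)
            fingerprints.append(f)
    return kept

rng = np.random.default_rng(0)
base = rng.uniform(0, 8, size=(40, 3))               # toy 40-atom structure
structures = [base + rng.normal(0, eps, base.shape) for eps in (0.0, 0.001, 0.3)]
print(len(screen(structures)), "of", len(structures), "structures kept")
```

The two nearly identical structures collapse to one entry while the strongly perturbed one survives, which is the redundancy-versus-diversity trade-off the abstract describes.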

arXiv:2501.15804 (https://arxiv.org/abs/2501.15804) [pdf, other] cs.SE (Software Engineering)
Title: CodeImprove: Program Adaptation for Deep Code
Authors: Ravishka Rathnasuriya, Zijie Zhao, Wei Yang
Abstract: Leveraging deep learning (DL)-based code analysis tools to solve software engineering tasks is becoming increasingly popular. Code models often suffer performance degradation due to various reasons (e.g., code data shifts). Retraining is often required to address these issues, but frequent model updates are costly in labeling and deployment. In this paper, we explore an alternative solution: adapting the program inputs to the code models. This can be achieved in two steps: 1) input validation, which identifies whether an input is an out-of-scope program beyond a model's handling capability, and 2) input adaptation, which adapts out-of-scope inputs to become in-scope inputs. Validating program input is challenging, as current techniques focus on continuous inputs such as image data and fail with discrete inputs like code data, which have unique characteristics and are processed differently by deep learning models. Adapting out-of-scope programs is also challenging due to their vast search spaces. Therefore, in this paper, we propose CodeImprove, which distinguishes out-of-scope from normal inputs and converts such out-of-scope inputs back to in-scope inputs through program transformation. In particular, we propose a validity score metric to identify out-of-scope inputs and leverage genetic algorithms to apply semantic-preserving program transformations that convert out-of-scope inputs to in-scope inputs. Our experimental results show CodeImprove can improve accuracy by up to 8.78%, with relative improvements of up to 51.28%, across three code models on two SE tasks. Additionally, our input validation is promising in detecting out-of-scope inputs (AUC score of 0.924).
Submitted 27 January, 2025; originally announced January 2025.
Comments: In Proceedings of the 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)
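The two ingredients in this abstract, a validity score and semantic-preserving transformation, can be sketched as follows. Everything here is a stand-in: `validity_score` is a dummy (the paper derives its metric from the code model itself), and a single consistent identifier renaming stands in for the transformations a genetic search would explore. Requires Python 3.9+ for `ast.unparse`.

```python
# Semantic-preserving renaming plus a stand-in validity score; Python 3.9+.
import ast

class Rename(ast.NodeTransformer):
    def __init__(self, mapping):
        self.mapping = mapping
    def visit_Name(self, node):          # rename variable uses consistently
        node.id = self.mapping.get(node.id, node.id)
        return node
    def visit_arg(self, node):           # keep function parameters consistent
        node.arg = self.mapping.get(node.arg, node.arg)
        return node

def rename_identifiers(src, mapping):
    return ast.unparse(Rename(mapping).visit(ast.parse(src)))

def validity_score(src):
    # Stand-in only: a real score would come from the code model's behavior
    # on `src` (e.g., prediction confidence); here we just favor shorter code.
    return 1.0 / (1.0 + len(src))

code = "def add(first_number, second_number):\n    return first_number + second_number\n"
candidate = rename_identifiers(code, {"first_number": "a", "second_number": "b"})
print(candidate)
print(validity_score(code), "->", validity_score(candidate))
# A genetic search would breed and mutate such transformations, keeping the
# candidate that maximizes the validity score, i.e., the most in-scope input.
```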

arXiv:2501.13947 (https://arxiv.org/abs/2501.13947) [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: A Comprehensive Survey on Integrating Large Language Models with Knowledge-Based Methods
Authors: Lilian Some, Wenli Yang, Michael Bain, Byeong Kang
Abstract: The rapid development of artificial intelligence has brought about substantial advancements in the field. One promising direction is the integration of Large Language Models (LLMs) with structured knowledge-based systems. This approach aims to enhance AI capabilities by combining the generative language understanding of LLMs with the precise knowledge representation of structured systems. This survey explores the synergy between LLMs and knowledge bases, focusing on real-world applications and addressing associated technical, operational, and ethical challenges. Through a comprehensive literature review, the study identifies critical issues and evaluates existing solutions. The paper highlights the benefits of integrating generative AI with knowledge bases, including improved data contextualization, enhanced model accuracy, and better utilization of knowledge resources. The findings provide a detailed overview of the current state of research, identify key gaps, and offer actionable recommendations. These insights contribute to advancing AI technologies and support their practical deployment across various sectors.
Submitted 19 January, 2025; originally announced January 2025.

arXiv:2501.13354 (https://arxiv.org/abs/2501.13354) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: NUDT4MSTAR: A Large Dataset and Benchmark Towards Remote Sensing Object Recognition in the Wild
Authors: Yongxiang Liu, Weijie Li, Li Liu, Jie Zhou, Xuying Xiong, Bowen Peng, Yafei Song, Wei Yang, Tianpeng Liu, Zhen Liu, Xiang Li
Abstract: As an indispensable sensor for remote sensing, Synthetic Aperture Radar (SAR) has a unique capability for all-day imaging. Nevertheless, in a data-driven era, the scarcity of large-scale datasets poses a significant bottleneck to advancing SAR automatic target recognition (ATR) technology. This paper introduces NUDT4MSTAR, a large-scale SAR dataset for remote sensing target recognition in the wild, including 40 vehicle target types and various imaging conditions across 5 realistic scenes. NUDT4MSTAR represents a significant leap forward in dataset scale, containing over 190,000 images, tenfold the size of its predecessors. We meticulously annotate each image with detailed target information and imaging conditions. Besides, data in both processed magnitude images and original complex formats are provided. Then, we construct a comprehensive benchmark consisting of 7 experiments with 15 recognition methods focusing on stable and effective ATR issues. Besides, we conduct transfer learning experiments utilizing various models trained on NUDT4MSTAR and apply them to three other target datasets, demonstrating its substantial potential for the broader field of ground-object ATR. Finally, we discuss this dataset's application value and ATR's significant challenges. To the best of our knowledge, this work marks the first-ever endeavor to create a large-scale dataset benchmark for fine-grained SAR recognition in the wild, featuring an extensive collection of exhaustively annotated vehicle images. We expect that the open sourcing of NUDT4MSTAR will facilitate the development of SAR ATR and attract a wider community of researchers.
Submitted 29 January, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
Comments: 18 pages, 14 figures; NUDT4MSTAR: https://github.com/waterdisappear/NUDT4MSTAR

arXiv:2501.13349 (https://arxiv.org/abs/2501.13349) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: MSF: Efficient Diffusion Model Via Multi-Scale Latent Factorize
Authors: Haohang Xu, Longyu Chen, Shuangrui Ding, Yilin Gao, Dongsheng Jiang, Yin Li, Shugong Xu, Junqing Yu, Wei Yang
Abstract: Diffusion-based generative models have achieved remarkable progress in visual content generation. However, traditional diffusion models directly denoise the entire image from noisy inputs, disregarding the hierarchical structure present in visual signals. This method is computationally intensive, especially for high-resolution image generation. Signal processing often leverages hierarchical decompositions; for instance, Fourier analysis decomposes signals by frequency, while wavelet analysis captures localized frequency components, reflecting both spatial and frequency information simultaneously. Inspired by these principles, we propose a multiscale diffusion framework that generates hierarchical visual representations, which are subsequently integrated to form the final output. The diffusion model target, whether raw RGB pixels or latent features from a Variational Autoencoder, is divided into multiple components that each capture distinct spatial levels. The low-resolution component contains the primary informative signal, while higher-resolution components add high-frequency details, such as texture. This approach divides image generation into two stages: producing a low-resolution base signal, followed by a high-resolution residual signal. Both stages can be effectively modeled using simpler, lightweight transformer architectures compared to full-resolution generation. This decomposition is conceptually similar to wavelet decomposition but offers a more streamlined and intuitive design. Our method, termed MSF (short for Multi-Scale Factorization), achieves an FID of 2.2 and an IS of 255.4 on the ImageNet 256x256 benchmark, reducing computational costs by 50% compared to baseline methods.
Submitted 22 January, 2025; originally announced January 2025.
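The base-plus-residual factorization described here is simple to demonstrate. The sketch below (NumPy only; the pooling choice and sizes are assumptions, and the real stage models are omitted) splits an image into a low-resolution base and a high-frequency residual, and shows that recombining them is exact, which is what allows each stage to be generated separately.

```python
# Two-level base/residual factorization of an image; NumPy only.
import numpy as np

def down2(x):    # 2x2 average pooling: the low-resolution base
    return x.reshape(x.shape[0] // 2, 2, x.shape[1] // 2, 2).mean(axis=(1, 3))

def up2(x):      # nearest-neighbor upsampling back to full size
    return np.repeat(np.repeat(x, 2, axis=0), 2, axis=1)

rng = np.random.default_rng(0)
img = rng.uniform(size=(8, 8))

base = down2(img)             # stage 1 target: primary informative signal
residual = img - up2(base)    # stage 2 target: high-frequency details
recon = up2(base) + residual  # recombining the two stages is exact by design

print("max reconstruction error:", np.abs(recon - img).max())  # 0.0
```

In the paper's setting each stage would be produced by a lightweight diffusion model rather than copied from the input, but the lossless split-and-recombine structure is the point of the factorization.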

arXiv:2501.11203 (https://arxiv.org/abs/2501.11203) [pdf] cs.CV (Computer Vision and Pattern Recognition)
Title: Advancing Oyster Phenotype Segmentation with Multi-Network Ensemble and Multi-Scale mechanism
Authors: Wenli Yang, Yanyu Chen, Andrew Trotter, Byeong Kang
Abstract: Phenotype segmentation is pivotal in analysing visual features of living organisms, enhancing our understanding of their characteristics. In the context of oysters, meat quality assessment is paramount, focusing on shell, meat, gonad, and muscle components. Traditional manual inspection methods are time-consuming and subjective, prompting the adoption of machine vision technology for efficient and objective evaluation. We explore machine vision's capacity for segmenting oyster components, leading to the development of a multi-network ensemble approach with a global-local hierarchical attention mechanism. This approach integrates predictions from diverse models and addresses challenges posed by varying scales, ensuring robust instance segmentation across components. Finally, we provide a comprehensive evaluation of the proposed method's performance using different real-world datasets, highlighting its efficacy and robustness in enhancing oyster phenotype segmentation.
Submitted 19 January, 2025; originally announced January 2025.

arXiv:2501.11031 (https://arxiv.org/abs/2501.11031) [pdf, other] cs.SE (Software Engineering); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: AdaptiveLog: An Adaptive Log Analysis Framework with the Collaboration of Large and Small Language Model
Authors: Lipeng Ma, Weidong Yang, Yixuan Li, Ben Fei, Mingjie Zhou, Shuhao Li, Sihang Jiang, Bo Xu, Yanghua Xiao
Abstract: Automated log analysis is crucial to ensure high availability and reliability of complex systems. The advent of LLMs in NLP has ushered in a new era of language model-driven automated log analysis, garnering significant interest. Within this field, two primary paradigms based on language models for log analysis have become prominent. Small Language Models (SLMs) follow the pre-train and fine-tune paradigm, focusing on the specific log analysis task through fine-tuning on supervised datasets. On the other hand, LLMs, following the in-context learning paradigm, analyze logs using a few examples provided in prompt contexts without updating parameters. Despite their respective strengths, we notice that SLMs are more cost-effective but less powerful, whereas LLMs with large parameters are highly powerful but expensive and inefficient. To balance the performance and inference costs of both models in automated log analysis, this paper introduces an adaptive log analysis framework known as AdaptiveLog, which effectively reduces the costs associated with LLM usage while ensuring superior results. This framework pairs an LLM with a small language model, strategically allocating the LLM to tackle complex logs while delegating simpler logs to the SLM. Specifically, to query the LLM efficiently, we propose an adaptive selection strategy based on the uncertainty estimation of the SLM, where the LLM is invoked only when the SLM is uncertain. In addition, to enhance the reasoning ability of the LLM in log analysis tasks, we propose a novel prompt strategy that retrieves similar error-prone cases as references, enabling the model to leverage past error experiences and learn solutions from these cases. Extensive experiments demonstrate that AdaptiveLog achieves state-of-the-art results across different tasks, elevating the overall accuracy of log analysis while maintaining cost efficiency.
Submitted 19 January, 2025; originally announced January 2025.
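The uncertainty-gated routing at the heart of this framework fits in a few lines. Below is a minimal sketch (the model stand-ins and entropy threshold are assumptions; the paper's uncertainty estimator and retrieval-based prompting are more elaborate): the SLM answers by default, and the LLM is consulted only when the SLM's predictive entropy is high.

```python
# Uncertainty-gated routing between a small and a large model; stdlib only.
import math

def entropy(probs):
    return -sum(p * math.log(p + 1e-12) for p in probs)

def analyze_log(line, slm_predict, llm_predict, threshold=0.5):
    """Use the SLM by default; escalate to the LLM only when the SLM's
    predictive entropy signals uncertainty."""
    label, probs = slm_predict(line)
    if entropy(probs) <= threshold:
        return label, "slm"                 # cheap path, used for easy logs
    return llm_predict(line), "llm"         # expensive path, used sparingly

# Toy stand-ins for the two models:
slm = lambda line: (("error", [0.9, 0.1]) if "fail" in line else ("info", [0.55, 0.45]))
llm = lambda line: "warning"

for line in ["disk fail on node 3", "heartbeat ok-ish"]:
    print(line, "->", analyze_log(line, slm, llm))
```

The first line is confidently handled by the SLM; the second has near-uniform probabilities, so it is escalated, which is exactly the cost/accuracy trade the abstract targets.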

arXiv:2501.10788 (https://arxiv.org/abs/2501.10788) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Decoupling Appearance Variations with 3D Consistent Features in Gaussian Splatting
Authors: Jiaqi Lin, Zhihao Li, Binxiao Huang, Xiao Tang, Jianzhuang Liu, Shiyong Liu, Xiaofei Wu, Fenglong Song, Wenming Yang
Abstract: Gaussian Splatting has emerged as a prominent 3D representation in novel view synthesis, but it still suffers from appearance variations, which are caused by various factors, such as modern camera ISPs, different times of day, weather conditions, and local light changes. These variations can lead to floaters and color distortions in the rendered images/videos. Recent appearance modeling approaches in Gaussian Splatting are either tightly coupled with the rendering process, hindering real-time rendering, or they only account for mild global variations, performing poorly in scenes with local light changes. In this paper, we propose DAVIGS, a method that decouples appearance variations in a plug-and-play and efficient manner. By transforming the rendering results at the image level instead of the Gaussian level, our approach can model appearance variations with minimal optimization time and memory overhead. Furthermore, our method gathers appearance-related information in 3D space to transform the rendered images, thus building 3D consistency across views implicitly. We validate our method on several appearance-variant scenes, and demonstrate that it achieves state-of-the-art rendering quality with minimal training time and memory usage, without compromising rendering speeds. Additionally, it provides performance improvements for different Gaussian Splatting baselines in a plug-and-play manner.
Submitted 18 January, 2025; originally announced January 2025.
Comments: Accepted to AAAI 2025. Project website: https://davi-gaussian.github.io

arXiv:2501.10404 (https://arxiv.org/abs/2501.10404) [pdf, other] eess.SP (Signal Processing); cs.LG (Machine Learning)
Title: Automated Detection of Epileptic Spikes and Seizures Incorporating a Novel Spatial Clustering Prior
Authors: Hanyang Dong, Shurong Sheng, Xiongfei Wang, Jiahong Gao, Yi Sun, Wanli Yang, Kuntao Xiao, Pengfei Teng, Guoming Luan, Zhao Lv
Abstract: A Magnetoencephalography (MEG) time-series recording consists of multi-channel signals collected by superconducting sensors, with each signal's intensity reflecting magnetic field changes over time at the sensor location. Automating epileptic MEG spike detection significantly reduces manual assessment time and effort, yielding substantial clinical benefits. Existing research addresses MEG spike detection by encoding neural network inputs with signals from all channels within a time segment, followed by classification. However, these methods overlook simultaneous spikes occurring at nearby sensors. We introduce a simple yet effective paradigm that first clusters MEG channels based on their sensors' spatial positions. Next, a novel convolutional input module is designed to integrate the spatial clustering and temporal changes of the signals. This module is fed into a custom MEEG-ResNet3D developed by the authors, which learns to extract relevant features and classify the input as a spike clip or not. Our method achieves an F1 score of 94.73% on Sanbo-CMR, a large real-world MEG dataset collected from two centers, outperforming state-of-the-art approaches by 1.85%. Moreover, it demonstrates efficacy and stability in the electroencephalographic (EEG) seizure detection task, yielding a 1.4% improvement in weighted F1 score over current state-of-the-art techniques evaluated on TUSZ, which is the largest EEG seizure dataset.
Submitted 4 January, 2025; originally announced January 2025.
Comments: 8 pages, 6 figures, accepted by BIBM2024
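The spatial-clustering prior is straightforward to sketch: group channels by sensor position so that simultaneous spikes at nearby sensors land in the same input group. The snippet below is illustrative only (the sensor positions are random stand-ins and the cluster count is an assumption; assumes scikit-learn).

```python
# Cluster MEG channels by sensor position, then group their signals.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
sensor_xyz = rng.normal(size=(306, 3))                # e.g., 306 MEG sensors
sensor_xyz /= np.linalg.norm(sensor_xyz, axis=1, keepdims=True)  # helmet-like sphere

labels = KMeans(n_clusters=8, n_init=10, random_state=0).fit_predict(sensor_xyz)

signals = rng.normal(size=(306, 400))                 # channels x time samples
clustered = [signals[labels == k] for k in range(8)]  # per-cluster channel stacks
print([c.shape for c in clustered])                   # input groups for the conv module
```

Each per-cluster stack is then what a convolutional input module of the kind described above would consume, so spatially adjacent channels are presented together rather than flattened across the whole helmet.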

arXiv:2501.10325 (https://arxiv.org/abs/2501.10325) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: DiffStereo: High-Frequency Aware Diffusion Model for Stereo Image Restoration
Authors: Huiyun Cao, Yuan Shi, Bin Xia, Xiaoyu Jin, Wenming Yang
Abstract: Diffusion models (DMs) have achieved promising performance in image restoration but have not been explored for stereo images. The application of DMs to stereo image restoration is confronted with a series of challenges. The need to reconstruct two images exacerbates the computational cost of DMs. Additionally, existing latent DMs usually focus on semantic information and remove high-frequency details as redundancy during latent compression, which is precisely what matters for image restoration. To address the above problems, we propose DiffStereo, a high-frequency aware diffusion model for stereo image restoration and the first attempt at applying DMs to this domain. Specifically, DiffStereo first learns latent high-frequency representations (LHFR) of HQ images. The DM is then trained in the learned space to estimate LHFR for stereo images, which are fused into a transformer-based stereo image restoration network, providing beneficial high-frequency information from the corresponding HQ images. The resolution of the LHFR is kept the same as that of the input images, which preserves the inherent texture from distortion, while the compression in channels alleviates the computational burden of the DM. Furthermore, we devise a position encoding scheme when integrating the LHFR into the restoration network, enabling distinctive guidance at different depths of the restoration network. Comprehensive experiments verify that by combining a generative DM and a transformer, DiffStereo achieves both higher reconstruction accuracy and better perceptual quality on stereo super-resolution, deblurring, and low-light enhancement compared with state-of-the-art methods.
Submitted 17 January, 2025; originally announced January 2025.
Comments: 9 pages, 6 figures
Then, the foreground matrix of the current frame can be obtained. Finally, the alternating direction method of multipliers (ADMM) is used to eliminate strong scattering objects from the foreground matrix and obtain the final results. The experimental results indicate that, compared with several other advanced pre-processing algorithms, the SE-BSFV algorithm significantly enhances the saliency of the shadows and greatly improves detection performance while ensuring efficiency.
Submitted 16 January, 2025; originally announced January 2025.
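The low-rank-plus-sparse decomposition underlying this family of methods fits in a few lines. The following toy split (NumPy; a batch RPCA-style alternating-thresholding scheme with an illustrative penalty, not the authors' online GMD-based formulation) separates a stack of vectorized frames into a slowly varying background and a sparse foreground:

import numpy as np

def lowrank_sparse_split(D, lam=0.1, n_iter=50):
    """Toy RPCA-style split D ~ L + S: L captures the low-rank
    background, S the sparse residual (shadows/targets)."""
    S = np.zeros_like(D)
    for _ in range(n_iter):
        # low-rank update: singular-value thresholding of the residual
        U, sig, Vt = np.linalg.svd(D - S, full_matrices=False)
        L = (U * np.maximum(sig - lam, 0.0)) @ Vt
        # sparse update: entrywise soft thresholding of the residual
        S = np.sign(D - L) * np.maximum(np.abs(D - L) - lam, 0.0)
    return L, S

frames = np.random.rand(100, 20)   # 20 vectorized ViSAR frames as columns
L, S = lowrank_sparse_split(frames)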
arXiv:2501.08305 [pdf, other] cs.LG
Benchmarking Graph Representations and Graph Neural Networks for Multivariate Time Series Classification
Authors: Wennuo Yang, Shiling Wu, Yuzhi Zhou, Cheng Luo, Xilin He, Weicheng Xie, Linlin Shen, Siyang Song
Abstract: Multivariate Time Series Classification (MTSC) enables the analysis of complex temporal data, and thus serves as a cornerstone in various real-world applications, ranging from healthcare to finance. Since the relationships among variables in MTS usually contain crucial cues, a large number of graph-based MTSC approaches have been proposed, as graph topology and edges can explicitly represent relationships among variables (channels); both various MTS graph representation learning strategies and different Graph Neural Networks (GNNs) have been explored. Despite such progress, there is no comprehensive study that fairly benchmarks and investigates the performance of existing widely-used graph representation learning strategies and GNN classifiers across different MTSC tasks. In this paper, we present the first benchmark that systematically investigates the effectiveness of the three widely-used node feature definition strategies, four edge feature learning strategies, and five GNN architectures, resulting in 60 different variants for graph-based MTSC. These variants are developed and evaluated with a standardized data pipeline and training/validation/testing strategy on 26 widely-used MTSC datasets. Our experiments highlight that node features significantly influence MTSC performance, while the visualization of edge features illustrates why adaptive edge learning outperforms other edge feature learning methods. The code of the proposed benchmark is publicly available at https://github.com/CVI-yangwn/Benchmark-GNN-for-Multivariate-Time-Series-Classification.
Submitted 17 January, 2025; v1 submitted 14 January, 2025; originally announced January 2025.
MSC Class: 68T10
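For a flavor of what one such graph-based MTSC variant looks like, here is a minimal dense GCN classifier over the channel graph of a multivariate series (plain PyTorch; the per-channel node features, the learned dense adjacency standing in for adaptive edge learning, and all names are illustrative assumptions, not the benchmark's code):

import torch
import torch.nn as nn

class ChannelGCN(nn.Module):
    """One-layer GCN over the variable (channel) graph of an MTS sample:
    node features are per-channel summaries, and the adjacency is a
    learned parameter in the spirit of adaptive edge learning."""
    def __init__(self, n_channels, in_dim, hid, n_classes):
        super().__init__()
        self.adj = nn.Parameter(torch.eye(n_channels))  # learnable edges
        self.lin = nn.Linear(in_dim, hid)
        self.cls = nn.Linear(hid, n_classes)

    def forward(self, x):                  # x: (batch, channels, in_dim)
        a = torch.softmax(self.adj, dim=-1)
        h = torch.relu(a @ self.lin(x))    # message passing over channels
        return self.cls(h.mean(dim=1))     # pool nodes, then classify

logits = ChannelGCN(6, 8, 32, 4)(torch.randn(2, 6, 8))  # (2, 4)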
arXiv:2501.07320 [pdf, other] cs.HC
ChartEditor: A Human-AI Paired Tool for Authoring Pictorial Charts
Authors: Siyu Yan, Tiancheng Liu, Weikai Yang, Nan Tang, Yuyu Luo
Abstract: Pictorial charts are favored for their memorability and visual appeal, offering a more engaging alternative to basic charts. However, their creation can be complex and time-consuming due to the lack of native support in popular visualization tools like Tableau. While AI-generated content (AIGC) tools have lowered the barrier to creating pictorial charts, they often lack precise design control. To address this issue, we introduce ChartEditor, a human-AI paired tool that transforms basic charts into pictorial versions based on user intent. ChartEditor decomposes chart images into visual components and organizes them within a hierarchical tree. Based on this tree, users can express their intent in natural language, which is then translated into modifications to the hierarchy. In addition, users can directly interact with and modify specific chart components via an intuitive interface to achieve fine-grained design control. A user study demonstrates the effectiveness and usability of ChartEditor in simplifying the creation of pictorial charts.
Submitted 13 January, 2025; originally announced January 2025.
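The hierarchical decomposition described here is easy to picture as a component tree. This toy structure (Python; the node fields and the routing of an intent to one subtree are hypothetical, not ChartEditor's schema) shows how a natural-language edit could be resolved against the hierarchy:

from dataclasses import dataclass, field

@dataclass
class ChartNode:
    """A node in a chart's visual-component hierarchy."""
    name: str                        # e.g. "axis", "bar-group", "pictogram"
    style: dict = field(default_factory=dict)
    children: list = field(default_factory=list)

    def find(self, name):
        """Locate the component an edit intent refers to."""
        if self.name == name:
            return self
        for child in self.children:
            hit = child.find(name)
            if hit:
                return hit

root = ChartNode("chart", children=[ChartNode("axis"), ChartNode("bar-group")])
# intent "draw the bars as coffee cups" -> modify exactly one subtree
root.find("bar-group").style["icon"] = "coffee-cup"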
arXiv:2501.06710 [pdf, other] cs.CV cs.AI
Multi-task Visual Grounding with Coarse-to-Fine Consistency Constraints
Authors: Ming Dai, Jian Li, Jiedong Zhuang, Xian Zhang, Wankou Yang
Abstract: Multi-task visual grounding involves the simultaneous execution of localization and segmentation in images based on textual expressions. The majority of advanced methods predominantly focus on transformer-based multimodal fusion, aiming to extract robust multimodal representations. However, ambiguity between referring expression comprehension (REC) and referring image segmentation (RIS) is error-prone, leading to inconsistencies between multi-task predictions. Besides, insufficient multimodal understanding directly contributes to biased target perception. To overcome these challenges, we propose a Coarse-to-fine Consistency Constraints Visual Grounding architecture ($\text{C}^3\text{VG}$), which integrates implicit and explicit modeling approaches within a two-stage framework. Initially, query and pixel decoders are employed to generate preliminary detection and segmentation outputs, a process referred to as the Rough Semantic Perception (RSP) stage. These coarse predictions are subsequently refined through the proposed Mask-guided Interaction Module (MIM) and a novel explicit bidirectional consistency constraint loss to ensure consistent representations across tasks, a stage we term Refined Consistency Interaction (RCI). Furthermore, to address the challenge of insufficient multimodal understanding, we leverage pre-trained models based on visual-linguistic fusion representations.
Empirical evaluations on the RefCOCO, RefCOCO+, and RefCOCOg datasets demonstrate the efficacy and soundness of $\text{C}^3\text{VG}$, which outperforms state-of-the-art REC and RIS methods by a substantial margin. Code and models will be available at https://github.com/Dmmm1997/C3VG.
Submitted 11 January, 2025; originally announced January 2025.
Comments: AAAI2025
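One plausible reading of a bidirectional box-mask consistency term, sketched in plain PyTorch (this is an illustrative stand-in, not the paper's loss: it penalizes disagreement between the predicted box and the tight box around the predicted mask):

import torch
import torch.nn.functional as F

def box_mask_consistency(pred_box, pred_mask):
    """Toy consistency: the tight bounding box of the predicted mask
    should agree with the predicted box (x1, y1, x2, y2 in pixels)."""
    ys, xs = torch.where(pred_mask > 0.5)
    if len(xs) == 0:
        return pred_box.abs().sum() * 0.0   # nothing segmented yet
    mask_box = torch.stack([xs.min(), ys.min(), xs.max(), ys.max()]).float()
    return F.l1_loss(pred_box, mask_box)

box = torch.tensor([3.0, 2.0, 9.0, 8.0])
mask = torch.zeros(16, 16)
mask[2:9, 3:10] = 1.0
loss = box_mask_consistency(box, mask)      # 0 when the two tasks agree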
arXiv:2501.06585 [pdf, other] cs.LG cs.SI
doi: 10.1016/j.knosys.2024.112917
Boundary-enhanced time series data imputation with long-term dependency diffusion models
Authors: Chunjing Xiao, Xue Jiang, Xianghe Du, Wei Yang, Wei Lu, Xiaomin Wang, Kevin Chetty
Abstract: Data imputation is crucial for addressing challenges posed by missing values in multivariate time series data across various fields, such as healthcare, traffic, and economics, and has garnered significant attention. Among various methods, diffusion model-based approaches show notable performance improvements. However, existing methods often cause disharmonious boundaries between missing and known regions and overlook long-range dependencies in missing data estimation, leading to suboptimal results. To address these issues, we propose a Diffusion-based time Series Data Imputation (DSDI) framework. We develop a weight-reducing injection strategy that incorporates the predicted values of missing points, with reducing weights, into the reverse diffusion process to mitigate boundary inconsistencies. Further, we introduce a multi-scale S4-based U-Net, which combines hierarchical information from different levels via multi-resolution integration to capture long-term dependencies. Experimental results demonstrate that our model outperforms existing imputation methods.
Submitted 11 January, 2025; originally announced January 2025.
Comments: Accepted by Knowledge-Based Systems
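A minimal sketch of the weight-reducing injection idea (NumPy over a generic denoiser; the linear schedule, the crude prior for the holes, and the `denoise_step` callable are all assumptions for illustration):

import numpy as np

def impute_with_injection(x_t, known, mask, denoise_step, T=50):
    """Reverse diffusion for imputation: at early (noisy) steps the
    pre-predicted values are injected strongly into the missing region,
    and the injection weight decays so the model's own samples dominate
    near t=0, avoiding hard seams at the mask boundary."""
    pre_pred = known.mean() * np.ones_like(x_t)   # crude prior for holes
    for t in range(T, 0, -1):
        x_t = denoise_step(x_t, t)                # one reverse step
        w = t / T                                 # weight reduces over time
        inject = w * pre_pred + (1 - w) * x_t
        x_t = mask * known + (1 - mask) * inject  # keep observed values
    return x_t

mask = (np.arange(8) < 4).astype(float)           # first half observed
x0 = impute_with_injection(np.random.randn(8), np.ones(8), mask,
                           lambda x, t: 0.9 * x)  # stand-in denoiser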
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Knowledge-Based Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04579">arXiv:2501.04579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04579">pdf</a>, <a href="https://arxiv.org/format/2501.04579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Unified Coding for Both Human Perception and Generalized Machine Analytics with CLIP Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+K">Kangsheng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Quan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xuelin Shen</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yulin He</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiqi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04579v1-abstract-short" style="display: inline;"> The image compression model has long struggled with adaptability and generalization, as the decoded bitstream typically serves only human or machine needs and fails to preserve information for unseen visual tasks. Therefore, this paper innovatively introduces supervision obtained from multimodal pre-training models and incorporates adaptive multi-objective optimization tailored to support both hum&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04579v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04579v1-abstract-full" style="display: none;"> The image compression model has long struggled with adaptability and generalization, as the decoded bitstream typically serves only human or machine needs and fails to preserve information for unseen visual tasks. Therefore, this paper innovatively introduces supervision obtained from multimodal pre-training models and incorporates adaptive multi-objective optimization tailored to support both human visual perception and machine vision simultaneously with a single bitstream, denoted as Unified and Generalized Image Coding for Machine (UG-ICM). Specifically, to get rid of the reliance between compression models with downstream task supervision, we introduce Contrastive Language-Image Pre-training (CLIP) models into the training constraint for improved generalization. Global-to-instance-wise CLIP supervision is applied to help obtain hierarchical semantics that make models more generalizable for the tasks relying on the information of different granularity. 
Furthermore, to support both human and machine vision with a single unifying bitstream, we incorporate a conditional decoding strategy that takes human or machine preferences as conditions, enabling the bitstream to be decoded into different versions for the corresponding preferences. As such, the proposed UG-ICM is fully trained in a self-supervised manner, i.e., without awareness of any specific downstream models and tasks. Extensive experiments show that the proposed UG-ICM achieves remarkable improvements in various unseen machine analytics tasks while simultaneously providing perceptually satisfying images.
Submitted 8 January, 2025; originally announced January 2025.
Comments: 9 pages, 10 figures, published at AAAI 2025
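A CLIP-space training constraint of this kind can be sketched generically (PyTorch; `clip_encode_image` is an assumed frozen image encoder passed in by the caller, and the cosine form is illustrative rather than the paper's exact objective):

import torch
import torch.nn.functional as F

def clip_consistency_loss(clip_encode_image, x, x_hat):
    """Toy CLIP supervision: the decoded image x_hat should keep the
    semantics of the original x, measured as cosine distance between
    embeddings from a frozen CLIP-style image encoder."""
    with torch.no_grad():                         # reference is frozen
        e_ref = F.normalize(clip_encode_image(x), dim=-1)
    e_hat = F.normalize(clip_encode_image(x_hat), dim=-1)
    return 1.0 - (e_ref * e_hat).sum(dim=-1).mean()

enc = lambda img: img.flatten(1)                  # stand-in encoder
loss = clip_consistency_loss(enc, torch.randn(2, 3, 8, 8),
                             torch.randn(2, 3, 8, 8, requires_grad=True))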
arXiv:2501.04306 [pdf, other] cs.CL cs.DL
LLM4SR: A Survey on Large Language Models for Scientific Research
Authors: Ziming Luo, Zonglin Yang, Zexin Xu, Wei Yang, Xinya Du
Abstract: In recent years, the rapid advancement of Large Language Models (LLMs) has transformed the landscape of scientific research, offering unprecedented support across various stages of the research cycle. This paper presents the first systematic survey dedicated to exploring how LLMs are revolutionizing the scientific research process. We analyze the unique roles LLMs play across four critical stages of research: hypothesis discovery, experiment planning and implementation, scientific writing, and peer reviewing. Our review comprehensively showcases the task-specific methodologies and evaluation benchmarks. By identifying current challenges and proposing future research directions, this survey not only highlights the transformative potential of LLMs but also aims to inspire and guide researchers and practitioners in leveraging LLMs to advance scientific inquiry. Resources are available at the following repository: https://github.com/du-nlp-lab/LLM4SR
Submitted 8 January, 2025; originally announced January 2025.

arXiv:2501.03832 [pdf] cs.LG cs.AI
Three-dimensional attention Transformer for state evaluation in real-time strategy games
Authors: Yanqing Ye, Weilong Yang, Kai Qiu, Jie Zhang
Abstract: Situation assessment in Real-Time Strategy (RTS) games is crucial for understanding decision-making in complex adversarial environments. However, existing methods remain limited in processing multi-dimensional feature information and temporal dependencies.
Here we propose a tri-dimensional Space-Time-Feature Transformer (TSTF Transformer) architecture, which efficiently models battlefield situations through three independent but cascaded modules: spatial attention, temporal attention, and feature attention. On a dataset comprising 3,150 adversarial experiments, the 8-layer TSTF Transformer demonstrates superior performance: it achieves 58.7% accuracy in the early game (~4% progress), significantly outperforming the conventional Timesformer's 41.8%, and reaches 97.6% accuracy in the mid-game (~40% progress) while maintaining low performance variation (standard deviation 0.114). Meanwhile, this architecture requires fewer parameters (4.75M) than the baseline model (5.54M). Our study not only provides new insights into situation assessment in RTS games but also presents an innovative paradigm for Transformer-based multi-dimensional temporal modeling.
Submitted 7 January, 2025; originally announced January 2025.
Comments: 9 pages, 5 figures
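The cascaded per-axis attention pattern can be illustrated compactly (PyTorch; the class is a toy with two of the three stages, the feature-attention stage is omitted for brevity, and all names and sizes are assumptions rather than the TSTF architecture):

import torch
import torch.nn as nn

class TriAxialAttention(nn.Module):
    """Toy cascade of attention over the axes of a (time, space, feature)
    battlefield tensor; each stage attends along one axis by folding the
    other axes into the batch dimension."""
    def __init__(self, dim, heads=2):
        super().__init__()
        self.spatial = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.temporal = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                    # x: (B, T, S, D)
        B, T, S, D = x.shape
        h = x.reshape(B * T, S, D)           # attend across space per step
        h, _ = self.spatial(h, h, h)
        h = h.reshape(B, T, S, D).transpose(1, 2).reshape(B * S, T, D)
        h, _ = self.temporal(h, h, h)        # attend across time per cell
        return h.reshape(B, S, T, D).transpose(1, 2)

y = TriAxialAttention(16)(torch.randn(2, 5, 9, 16))   # shape preserved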
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03824">arXiv:2501.03824</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03824">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Online Reinforcement Learning-Based Dynamic Adaptive Evaluation Function for Real-Time Strategy Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Weilong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xunyun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yanqing Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03824v1-abstract-short" style="display: inline;"> Effective evaluation of real-time strategy tasks requires adaptive mechanisms to cope with dynamic and unpredictable environments. This study proposes a method to improve evaluation functions for real-time responsiveness to battle-field situation changes, utilizing an online reinforcement learning-based dynam-ic weight adjustment mechanism within the real-time strategy game. Building on traditiona&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03824v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03824v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03824v1-abstract-full" style="display: none;"> Effective evaluation of real-time strategy tasks requires adaptive mechanisms to cope with dynamic and unpredictable environments. This study proposes a method to improve evaluation functions for real-time responsiveness to battle-field situation changes, utilizing an online reinforcement learning-based dynam-ic weight adjustment mechanism within the real-time strategy game. Building on traditional static evaluation functions, the method employs gradient descent in online reinforcement learning to update weights dynamically, incorporating weight decay techniques to ensure stability. Additionally, the AdamW optimizer is integrated to adjust the learning rate and decay rate of online reinforcement learning in real time, further reducing the dependency on manual parameter tun-ing. Round-robin competition experiments demonstrate that this method signifi-cantly enhances the application effectiveness of the Lanchester combat model evaluation function, Simple evaluation function, and Simple Sqrt evaluation function in planning algorithms including IDABCD, IDRTMinimax, and Port-folio AI. The method achieves a notable improvement in scores, with the en-hancement becoming more pronounced as the map size increases. Furthermore, the increase in evaluation function computation time induced by this method is kept below 6% for all evaluation functions and planning algorithms. 
The proposed dynamic adaptive evaluation function demonstrates a promising approach to real-time strategy task evaluation.
Submitted 7 January, 2025; originally announced January 2025.
Comments: 22 pages, 9 figures
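Since the abstract names the ingredients directly (a weighted evaluation function, online gradient descent, AdamW with weight decay), a minimal sketch follows; the feature vector, learning rate, and regression-style target are illustrative values, not the paper's setup:

import torch

# Toy online update of a linear state-evaluation function: game-state
# features are weighted by w, and w is adjusted toward observed outcomes
# with AdamW, whose built-in weight decay keeps the weights stable.
w = torch.zeros(4, requires_grad=True)            # evaluation weights
opt = torch.optim.AdamW([w], lr=1e-2, weight_decay=1e-4)

for features, outcome in [(torch.rand(4), 1.0), (torch.rand(4), -1.0)]:
    score = w @ features                          # evaluation of the state
    loss = (score - outcome) ** 2                 # online regression target
    opt.zero_grad()
    loss.backward()
    opt.step()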
arXiv:2501.03575 [pdf, other] cs.CV cs.AI cs.LG cs.RO
Cosmos World Foundation Model Platform for Physical AI
Authors: NVIDIA: Niket Agarwal, Arslan Ali, Maciej Bala, Yogesh Balaji, Erik Barker, Tiffany Cai, Prithvijit Chattopadhyay, Yongxin Chen, Yin Cui, Yifan Ding, Daniel Dworakowski, Jiaojiao Fan, Michele Fenzi, Francesco Ferroni, Sanja Fidler, Dieter Fox, Songwei Ge, Yunhao Ge, Jinwei Gu, Siddharth Gururani, Ethan He, Jiahui Huang, Jacob Huffman, et al. (54 additional authors not shown)
Abstract: Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.
Submitted 7 January, 2025; originally announced January 2025.
arXiv:2501.02863 [pdf, other] cs.SE
Beyond Pass or Fail: Multi-Dimensional Benchmarking of Foundation Models for Goal-based Mobile UI Navigation
Authors: Dezhi Ran, Mengzhou Wu, Hao Yu, Yuetong Li, Jun Ren, Yuan Cao, Xia Zeng, Haochuan Lu, Zexin Xu, Mengqian Xu, Ting Su, Liangchao Yao, Ting Xiong, Wei Yang, Yuetang Deng, Assaf Marron, David Harel, Tao Xie
Abstract: Recent advances in foundation models (FMs) have made navigating mobile applications (apps) based on high-level goal instructions within reach, with significant industrial applications such as UI testing.
While existing benchmarks evaluate FM-based UI navigation using a binary pass/fail metric, they have two major limitations: they cannot reflect the complex nature of mobile UI navigation, where FMs may fail for various reasons (e.g., misunderstanding instructions and failed planning), and they lack industrial relevance due to oversimplified tasks that poorly represent real-world scenarios. To address the preceding limitations, we propose Sphinx, a comprehensive benchmark for multi-dimensional evaluation of FMs in industrial settings of UI navigation. Sphinx introduces a specialized toolkit that evaluates five essential FM capabilities, providing detailed insights into failure modes such as insufficient app knowledge or planning issues. Using both popular Google Play applications and WeChat's internal UI test cases, we evaluate 8 FMs with 20 different configurations. Our results show that existing FMs universally struggle with goal-based testing tasks, primarily due to insufficient UI-specific capabilities. We summarize seven lessons learned from benchmarking FMs with Sphinx, providing clear directions for improving FM-based mobile UI navigation.
Submitted 11 February, 2025; v1 submitted 6 January, 2025; originally announced January 2025.

arXiv:2412.19744 [pdf, other] cs.RO
AAM-SEALS: Developing Aerial-Aquatic Manipulators in SEa, Air, and Land Simulator
Authors: William Yang, Karthikeya Kona, Yashveer Jain, Tomer Atzili, Abhinav Bhamidipati, Xiaomin Lin, Yantian Zha
Abstract: Current mobile manipulators and high-fidelity simulators lack the ability to seamlessly operate and simulate across integrated environments spanning sea, air, and land. To address this gap, we introduce Aerial-Aquatic Manipulators (AAMs) in SEa, Air, and Land Simulator (SEALS), a comprehensive and photorealistic simulator designed for AAMs to operate and learn in these diverse environments.
The development of AAM-SEALS tackles several significant challenges, including the creation of integrated controllers for flying, swimming, and manipulation, and the high-fidelity simulation of aerial dynamics and hydrodynamics, the latter leveraging a particle-based approach. Our evaluation demonstrates smooth operation and photorealistic transitions across air, water, and their interfaces. We quantitatively validate the fidelity of the particle-based hydrodynamics by comparing position-tracking errors across real-world and simulated systems. AAM-SEALS benefits a broad range of robotics communities, including robot learning, aerial robotics, underwater robotics, mobile manipulation, and robotic simulators. We will open-source our code and data to foster the advancement of research in these fields. The overview video is available at https://youtu.be/MbqIIrYvR78. Visit our project website at https://aam-seals-v1.umd.edu for more details.
Submitted 4 February, 2025; v1 submitted 27 December, 2024; originally announced December 2024.
arXiv:2412.18703 [pdf, other] cs.CV
Uncertainty Quantification in Stereo Matching
Authors: Wenxiao Cai, Dongting Hu, Ruoyan Yin, Jiankang Deng, Huan Fu, Wankou Yang, Mingming Gong
Abstract: Stereo matching plays a crucial role in various applications, where understanding uncertainty can enhance both safety and reliability. Despite this, the estimation and analysis of uncertainty in stereo matching have been largely overlooked. Previous works often provide limited interpretations of uncertainty and struggle to separate it effectively into data (aleatoric) and model (epistemic) components. This disentanglement is essential, as it allows for a clearer understanding of the underlying sources of error, enhancing both prediction confidence and decision-making processes. In this paper, we propose a new framework for stereo matching and its uncertainty quantification. We adopt Bayes risk as a measure of uncertainty and estimate data and model uncertainty separately. Experiments are conducted on four stereo benchmarks, and the results demonstrate that our method can estimate uncertainty accurately and efficiently. Furthermore, we apply our uncertainty method to improve prediction accuracy by selecting data points with small uncertainties, which reflects the accuracy of our estimated uncertainty. The code is publicly available at https://github.com/RussRobin/Uncertainty.
Submitted 24 December, 2024; originally announced December 2024.
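The aleatoric/epistemic disentanglement the abstract refers to has a standard ensemble form, sketched below (PyTorch; this is the textbook decomposition, not the paper's Bayes-risk estimator, and the ensemble-of-predictors setup is an assumption):

import torch

def split_uncertainty(mu, var):
    """Toy decomposition for an ensemble of disparity predictors that
    each output a per-pixel mean mu[k] and predictive variance var[k]:
    aleatoric = mean of the predicted variances (data noise),
    epistemic = variance of the predicted means (model disagreement)."""
    aleatoric = var.mean(dim=0)
    epistemic = mu.var(dim=0)
    return aleatoric, epistemic

mu = torch.randn(5, 1, 32, 32)   # 5 ensemble members, one disparity map each
var = torch.rand(5, 1, 32, 32)
alea, epis = split_uncertainty(mu, var)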
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18703v1-abstract-full').style.display = 'none'; document.getElementById('2412.18703v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12778">arXiv:2412.12778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12778">pdf</a>, <a href="https://arxiv.org/format/2412.12778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Diffusion-Based Image Generators for Fundus Fluorescein Angiography Synthesis on Limited Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chengzhou Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+H">Huihui Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongqiu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+T">Ting Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Q">Qing Du</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yanwu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Weihua Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12778v1-abstract-short" style="display: inline;"> Fundus imaging is a critical tool in ophthalmology, with different imaging modalities offering unique advantages. For instance, fundus fluorescein angiography (FFA) can accurately identify eye diseases. However, traditional invasive FFA involves the injection of sodium fluorescein, which can cause discomfort and risks. Generating corresponding FFA images from non-invasive fundus images holds signi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12778v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12778v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12778v1-abstract-full" style="display: none;"> Fundus imaging is a critical tool in ophthalmology, with different imaging modalities offering unique advantages. For instance, fundus fluorescein angiography (FFA) can accurately identify eye diseases. However, traditional invasive FFA involves the injection of sodium fluorescein, which can cause discomfort and risks. Generating corresponding FFA images from non-invasive fundus images holds significant practical value but also presents challenges. First, limited datasets constrain the performance and effectiveness of models. Second, previous studies have primarily focused on generating FFA for single diseases or single modalities, often resulting in poor performance for patients with various ophthalmic conditions. 
To address these issues, we propose a novel latent diffusion model-based framework, Diffusion, which introduces a fine-tuning protocol to overcome the challenge of limited medical data and unleash the generative capabilities of diffusion models. Furthermore, we design a new approach to tackle the challenges of generating across different modalities and disease types. On limited datasets, our framework achieves state-of-the-art results compared to existing methods, offering significant potential to enhance ophthalmic diagnostics and patient care. Our code will be released soon to support further research in this field.
Submitted 17 December, 2024; originally announced December 2024.
Comments: 15 pages, 6 figures
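The abstract does not specify the fine-tuning protocol, but one common way to adapt a pretrained diffusion backbone on scarce medical data is to freeze it and train only small residual adapters; the sketch below (PyTorch, entirely hypothetical naming and sizing) illustrates that pattern:

import torch.nn as nn

class AdapterBlock(nn.Module):
    """Frozen pretrained layer plus a small trainable residual adapter,
    a common recipe for limited-data fine-tuning (illustrative only)."""
    def __init__(self, dim=8, bottleneck=2):
        super().__init__()
        self.frozen = nn.Linear(dim, dim)
        self.adapter = nn.Sequential(nn.Linear(dim, bottleneck),
                                     nn.Linear(bottleneck, dim))

    def forward(self, x):
        return self.frozen(x) + self.adapter(x)

block = AdapterBlock()
for name, p in block.named_parameters():
    p.requires_grad = name.startswith("adapter")  # train adapters only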

arXiv:2412.12669 [pdf, other] cs.CV
https://arxiv.org/abs/2412.12669
Title: Adaptive Prototype Replay for Class Incremental Semantic Segmentation
Authors: Guilin Zhu, Dongyue Wu, Changxin Gao, Runmin Wang, Weidong Yang, Nong Sang
Abstract: Class incremental semantic segmentation (CISS) aims to segment new classes during continual steps while preventing the forgetting of old knowledge. Existing methods alleviate catastrophic forgetting by replaying distributions of previously learned classes using stored prototypes or features. However, they overlook a critical issue: in CISS, the representation of class knowledge is updated continuously through incremental learning, whereas prototype replay methods maintain fixed prototypes. This mismatch between updated representations and fixed prototypes limits the effectiveness of the prototype replay strategy. To address this issue, we propose the Adaptive prototype replay (Adapter) for CISS in this paper. Adapter comprises an adaptive deviation compensation (ADC) strategy and an uncertainty-aware constraint (UAC) loss. Specifically, the ADC strategy dynamically updates the stored prototypes based on the estimated representation shift distance to match the updated representation of old classes. The UAC loss reduces prediction uncertainty, aggregating discriminative features to aid in generating compact prototypes. Additionally, we introduce a compensation-based prototype similarity discriminative (CPD) loss to ensure adequate differentiation between similar prototypes, thereby enhancing the efficiency of the adaptive prototype replay strategy. Extensive experiments on the Pascal VOC and ADE20K datasets demonstrate that Adapter achieves state-of-the-art results and proves effective across various CISS tasks, particularly in challenging multi-step scenarios. The code and model are available at https://github.com/zhu-gl-ux/Adapter.
Submitted: 17 December, 2024; originally announced December 2024.
Comments: Accepted by the Main Technical Track of the 39th Annual AAAI Conference on Artificial Intelligence (AAAI-2025)
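
The ADC step above invites a small illustration. The sketch below is a toy under loud assumptions: it estimates one global drift vector between the previous and current feature extractors and translates all stored prototypes by it, whereas Adapter estimates per-class shift distances; every name in it is hypothetical.

```python
import torch

def compensate_prototypes(prototypes, feats_old, feats_new):
    """Toy ADC-style compensation: shift stored class prototypes by the
    mean representation drift measured between the previous and current
    feature extractors on the same batch of current-step images.
    (Global drift only; the paper estimates per-class shift distances.)"""
    drift = (feats_new - feats_old).mean(dim=0)
    return prototypes + drift  # translated prototypes, ready for replay

# Toy usage: 5 stored prototypes in a 16-d embedding space.
protos = torch.randn(5, 16)
old = torch.randn(32, 16)
new = old + 0.1  # pretend the representation drifted uniformly
print(compensate_prototypes(protos, old, new).shape)  # torch.Size([5, 16])
```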
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the Main Technical Track of the 39th Annual AAAI Conference on Artificial Intelligence (AAAI-2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11582">arXiv:2412.11582</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11582">pdf</a>, <a href="https://arxiv.org/format/2412.11582">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Oriented Tiny Object Detection: A Dataset, Benchmark, and Dynamic Unbiased Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruixiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Haoran Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Fang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+J">Jian Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+G">Gui-Song Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11582v1-abstract-short" style="display: inline;"> Detecting oriented tiny objects, which are limited in appearance information yet prevalent in real-world applications, remains an intricate and under-explored problem. To address this, we systemically introduce a new dataset, benchmark, and a dynamic coarse-to-fine learning scheme in this study. Our proposed dataset, AI-TOD-R, features the smallest object sizes among all oriented object detection&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11582v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11582v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11582v1-abstract-full" style="display: none;"> Detecting oriented tiny objects, which are limited in appearance information yet prevalent in real-world applications, remains an intricate and under-explored problem. To address this, we systemically introduce a new dataset, benchmark, and a dynamic coarse-to-fine learning scheme in this study. Our proposed dataset, AI-TOD-R, features the smallest object sizes among all oriented object detection datasets. Based on AI-TOD-R, we present a benchmark spanning a broad range of detection paradigms, including both fully-supervised and label-efficient approaches. Through investigation, we identify a learning bias presents across various learning pipelines: confident objects become increasingly confident, while vulnerable oriented tiny objects are further marginalized, hindering their detection performance. To mitigate this issue, we propose a Dynamic Coarse-to-Fine Learning (DCFL) scheme to achieve unbiased learning. 

arXiv:2412.10629 [pdf] eess.IV cs.AI cs.CV
https://arxiv.org/abs/2412.10629
Title: Rapid Reconstruction of Extremely Accelerated Liver 4D MRI via Chained Iterative Refinement
Authors: Di Xu, Xin Miao, Hengjie Liu, Jessica E. Scholey, Wensha Yang, Mary Feng, Michael Ohliger, Hui Lin, Yi Lao, Yang Yang, Ke Sheng
Abstract: Purpose: High-quality 4D MRI requires an impractically long scanning time for dense k-space signal acquisition covering all respiratory phases. Accelerated sparse sampling followed by reconstruction enhancement is desired but often results in degraded image quality and long reconstruction time.
We hereby propose the chained iterative reconstruction network (CIRNet) for efficient sparse-sampling reconstruction while maintaining clinically deployable quality. Methods: CIRNet adopts the denoising diffusion probabilistic framework to condition the image reconstruction through a stochastic iterative denoising process. During training, a forward Markovian diffusion process is designed to gradually add Gaussian noise to the densely sampled ground truth (GT), while CIRNet is optimized to iteratively reverse the Markovian process from the forward outputs. At the inference stage, CIRNet performs the reverse process solely to recover signals from noise, conditioned upon the undersampled input. CIRNet processes the 4D data (3D+t) as temporal slices (2D+t). The proposed framework is evaluated on a data cohort consisting of 48 patients (12,332 temporal slices) who underwent free-breathing liver 4D MRI. 3-, 6-, 10-, 20- and 30-times acceleration were examined with a retrospective random undersampling scheme. Compressed sensing (CS) reconstruction with a spatiotemporal constraint and a recently proposed deep network, Re-Con-GAN, are selected as baselines. Results: CIRNet consistently achieved superior performance compared to CS and Re-Con-GAN. The inference times of CIRNet, CS, and Re-Con-GAN are 11 s, 120 s, and 0.15 s, respectively. Conclusion: A novel framework, CIRNet, is presented. CIRNet maintains usable image quality for acceleration up to 30 times, significantly reducing the burden of 4D MRI.
Submitted: 13 December, 2024; originally announced December 2024.
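
The conditioned reverse process the abstract describes is standard DDPM machinery; the sketch below shows one inference step under that reading. The network, noise schedule, and conditioning-by-concatenation are stand-ins, not CIRNet's actual design.

```python
import torch

def reverse_step(x_t, t, cond, eps_model, betas):
    """One DDPM-style reverse (denoising) step, conditioned on the
    undersampled input by channel concatenation. `eps_model` is a
    stand-in for CIRNet's network, whose real design is not shown here."""
    beta = betas[t]
    alpha_bar = torch.cumprod(1.0 - betas, dim=0)[t]
    eps = eps_model(torch.cat([x_t, cond], dim=1), t)   # predicted noise
    mean = (x_t - beta / torch.sqrt(1.0 - alpha_bar) * eps) / torch.sqrt(1.0 - beta)
    noise = torch.randn_like(x_t) if t > 0 else torch.zeros_like(x_t)
    return mean + torch.sqrt(beta) * noise

# Toy usage on one 2D+t slice of shape (batch, channel, H, W).
eps_model = lambda x, t: torch.zeros_like(x[:, :1])     # dummy network
betas = torch.linspace(1e-4, 0.02, 1000)
x, cond = torch.randn(1, 1, 64, 64), torch.randn(1, 1, 64, 64)
x = reverse_step(x, 999, cond, eps_model, betas)        # iterate t = T-1 .. 0
```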
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10104">arXiv:2412.10104</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10104">pdf</a>, <a href="https://arxiv.org/format/2412.10104">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RETQA: A Large-Scale Open-Domain Tabular Question Answering Dataset for Real Estate Sector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhensheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wenmian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+K">Kun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yiquan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+W">Weijia Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10104v2-abstract-short" style="display: inline;"> The real estate market relies heavily on structured data, such as property details, market trends, and price fluctuations. However, the lack of specialized Tabular Question Answering datasets in this domain limits the development of automated question-answering systems. To fill this gap, we introduce RETQA, the first large-scale open-domain Chinese Tabular Question Answering dataset for Real Estat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10104v2-abstract-full').style.display = 'inline'; document.getElementById('2412.10104v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10104v2-abstract-full" style="display: none;"> The real estate market relies heavily on structured data, such as property details, market trends, and price fluctuations. However, the lack of specialized Tabular Question Answering datasets in this domain limits the development of automated question-answering systems. To fill this gap, we introduce RETQA, the first large-scale open-domain Chinese Tabular Question Answering dataset for Real Estate. RETQA comprises 4,932 tables and 20,762 question-answer pairs across 16 sub-fields within three major domains: property information, real estate company finance information and land auction information. Compared with existing tabular question answering datasets, RETQA poses greater challenges due to three key factors: long-table structures, open-domain retrieval, and multi-domain queries. To tackle these challenges, we propose the SLUTQA framework, which integrates large language models with spoken language understanding tasks to enhance retrieval and answering accuracy. Extensive experiments demonstrate that SLUTQA significantly improves the performance of large language models on RETQA by in-context learning. RETQA and SLUTQA provide essential resources for advancing tabular question answering research in the real estate domain, addressing critical challenges in open-domain and long-table question-answering. The dataset and code are publicly available at \url{https://github.com/jensen-w/RETQA}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10104v2-abstract-full').style.display = 'none'; document.getElementById('2412.10104v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09026">arXiv:2412.09026</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09026">pdf</a>, <a href="https://arxiv.org/format/2412.09026">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Video Anomaly Detection with Motion and Appearance Guided Patch Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jiale Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yuteng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yonghui Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chenxing Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Junqing Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zikai Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09026v1-abstract-short" style="display: inline;"> A recent endeavor in one class of video anomaly detection is to leverage diffusion models and posit the task as a generation problem, where the diffusion model is trained to recover normal patterns exclusively, thus reporting abnormal patterns as outliers. Yet, existing attempts neglect the various formations of anomaly and predict normal samples at the feature level regardless that abnormal objec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09026v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09026v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09026v1-abstract-full" style="display: none;"> A recent endeavor in one class of video anomaly detection is to leverage diffusion models and posit the task as a generation problem, where the diffusion model is trained to recover normal patterns exclusively, thus reporting abnormal patterns as outliers. Yet, existing attempts neglect the various formations of anomaly and predict normal samples at the feature level regardless that abnormal objects in surveillance videos are often relatively small. 
To address this, a novel patch-based diffusion model is proposed, specifically engineered to capture fine-grained local information. We further observe that anomalies in videos manifest themselves as deviations in both appearance and motion. Therefore, we argue that a comprehensive solution must consider both of these aspects simultaneously to achieve accurate frame prediction. To this end, we introduce innovative motion and appearance conditions that are seamlessly integrated into our patch diffusion model. These conditions are designed to guide the model in generating coherent and contextually appropriate predictions for both semantic content and motion relations. Experimental results on four challenging video anomaly detection datasets empirically substantiate the efficacy of our proposed approach, demonstrating that it consistently outperforms most existing methods in detecting abnormal behaviors.
Submitted: 12 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
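
A patch-level, condition-guided scoring loop of the kind the abstract implies can be sketched as follows; the patch size, the frame-difference "motion" condition, and the identity denoiser are all placeholders for the paper's actual components.

```python
import torch

def patchify(frame, p=32):
    """Split a (C, H, W) frame into non-overlapping p x p patches, the
    granularity at which a patch-based diffusion model would operate."""
    c, h, w = frame.shape
    return (frame.unfold(1, p, p).unfold(2, p, p)
                 .permute(1, 2, 0, 3, 4).reshape(-1, c, p, p))

def anomaly_score(frame, prev_frame, denoiser):
    """Toy scoring: predict each patch from its appearance plus a crude
    motion condition (frame difference); large error flags an anomaly."""
    patches, prev = patchify(frame), patchify(prev_frame)
    motion = patches - prev
    pred = denoiser(torch.cat([patches, motion], dim=1))
    return ((pred - patches) ** 2).flatten(1).mean(dim=1)  # per-patch error

denoiser = lambda x: x[:, :3]                 # identity stand-in network
f0, f1 = torch.rand(3, 64, 64), torch.rand(3, 64, 64)
print(anomaly_score(f1, f0, denoiser).shape)  # torch.Size([4])
```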

arXiv:2412.08896 [pdf, other] cs.CV
https://arxiv.org/abs/2412.08896
Title: LV-CadeNet: Long View Feature Convolution-Attention Fusion Encoder-Decoder Network for Clinical MEG Spike Detection
Authors: Kuntao Xiao, Xiongfei Wang, Pengfei Teng, Yi Sun, Wanli Yang, Liang Zhang, Hanyang Dong, Guoming Luan, Shurong Sheng
Abstract: It is widely acknowledged that epileptic foci can be pinpointed by source-localizing interictal epileptic discharges (IEDs) via magnetoencephalography (MEG). However, manual detection of IEDs, which appear as spikes in MEG data, is extremely labor-intensive and requires considerable professional expertise, limiting the broader adoption of MEG technology. Numerous studies have focused on automatic detection of MEG spikes to overcome this challenge, but these efforts often validate their models on synthetic datasets with balanced positive and negative samples. In contrast, clinical MEG data is highly imbalanced, raising doubts about the real-world efficacy of these models. To address this issue, we introduce LV-CadeNet, a Long View feature Convolution-Attention fusion Encoder-Decoder Network, designed for automatic MEG spike detection in real-world clinical scenarios. Beyond addressing the disparity between the training data distribution and clinical test data through semi-supervised learning, our approach also mimics human specialists by constructing long-view morphological input data. Moreover, we propose an advanced convolution-attention module to extract temporal and spatial features from the input data. LV-CadeNet significantly improves the accuracy of MEG spike detection, boosting it from 42.31% to 54.88% on a novel clinical dataset sourced from Sanbo Brain Hospital, Capital Medical University. This dataset, characterized by a highly imbalanced distribution of positive and negative samples, accurately represents real-world clinical scenarios.
Submitted: 11 December, 2024; originally announced December 2024.
ACM Class: I.4.6; I.5.1; J.3
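
The "convolution-attention module" is described only at a high level; one plausible reading, sketched below with assumed layer sizes, pairs a temporal convolution with multi-head self-attention over the MEG time axis.

```python
import torch
import torch.nn as nn

class ConvAttentionBlock(nn.Module):
    """A plausible convolution-attention fusion block: a 1-D convolution
    extracts local temporal features from multichannel MEG, and multi-head
    self-attention mixes them across time. Layer sizes are assumptions,
    not LV-CadeNet's published configuration."""
    def __init__(self, channels, dim=64, heads=4):
        super().__init__()
        self.conv = nn.Conv1d(channels, dim, kernel_size=7, padding=3)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):                 # x: (batch, channels, time)
        h = self.conv(x).transpose(1, 2)  # -> (batch, time, dim)
        a, _ = self.attn(h, h, h)         # temporal self-attention
        return self.norm(h + a)           # residual fusion

block = ConvAttentionBlock(channels=32)
print(block(torch.randn(2, 32, 200)).shape)  # torch.Size([2, 200, 64])
```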
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.6; I.5.1; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08152">arXiv:2412.08152</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08152">pdf</a>, <a href="https://arxiv.org/format/2412.08152">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProGDF: Progressive Gaussian Differential Field for Controllable and Flexible 3D Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+W">Wanshi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weiheng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhongqian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08152v1-abstract-short" style="display: inline;"> 3D editing plays a crucial role in editing and reusing existing 3D assets, thereby enhancing productivity. Recently, 3DGS-based methods have gained increasing attention due to their efficient rendering and flexibility. However, achieving desired 3D editing results often requires multiple adjustments in an iterative loop, resulting in tens of minutes of training time cost for each attempt and a cum&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08152v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08152v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08152v1-abstract-full" style="display: none;"> 3D editing plays a crucial role in editing and reusing existing 3D assets, thereby enhancing productivity. Recently, 3DGS-based methods have gained increasing attention due to their efficient rendering and flexibility. However, achieving desired 3D editing results often requires multiple adjustments in an iterative loop, resulting in tens of minutes of training time cost for each attempt and a cumbersome trial-and-error cycle for users. This in-the-loop training paradigm results in a poor user experience. To address this issue, we introduce the concept of process-oriented modelling for 3D editing and propose the Progressive Gaussian Differential Field (ProGDF), an out-of-loop training approach that requires only a single training session to provide users with controllable editing capability and variable editing results through a user-friendly interface in real-time. ProGDF consists of two key components: Progressive Gaussian Splatting (PGS) and Gaussian Differential Field (GDF). PGS introduces the progressive constraint to extract the diverse intermediate results of the editing process and employs rendering quality regularization to improve the quality of these results. 

arXiv:2412.07277 [pdf, other] cs.CV cs.CR
https://arxiv.org/abs/2412.07277
Title: Backdoor Attacks against No-Reference Image Quality Assessment Models via a Scalable Trigger
Authors: Yi Yu, Song Xia, Xun Lin, Wenhan Yang, Shijian Lu, Yap-peng Tan, Alex Kot
Abstract: No-Reference Image Quality Assessment (NR-IQA), responsible for assessing the quality of a single input image without using any reference, plays a critical role in evaluating and optimizing computer vision systems, e.g., low-light enhancement. Recent research indicates that NR-IQA models are susceptible to adversarial attacks, which can significantly alter predicted scores with visually imperceptible perturbations.
Despite revealing vulnerabilities, these attack methods have limitations, including high computational demands, untargeted manipulation, limited practical utility in white-box scenarios, and reduced effectiveness in black-box scenarios. To address these challenges, we shift our focus to another significant threat and present a novel poisoning-based backdoor attack against NR-IQA (BAIQA), allowing the attacker to manipulate the IQA model's output to any desired target value by simply adjusting a scaling coefficient $\alpha$ for the trigger. We propose to inject the trigger in the discrete cosine transform (DCT) domain to improve the local invariance of the trigger, countering trigger diminishment in NR-IQA models due to widely adopted data augmentations. Furthermore, universal adversarial perturbations (UAP) in the DCT space are designed as the trigger, to increase IQA model susceptibility to manipulation and improve attack effectiveness. In addition to the heuristic method for poison-label BAIQA (P-BAIQA), we explore the design of clean-label BAIQA (C-BAIQA), focusing on $\alpha$ sampling and image data refinement, driven by theoretical insights we reveal. Extensive experiments on diverse datasets and various NR-IQA models demonstrate the effectiveness of our attacks. Code can be found at https://github.com/yuyi-sd/BAIQA.
Submitted: 10 January, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
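
The scalable-trigger mechanism is concrete enough to sketch: add a (learned) perturbation to the image's DCT coefficients, scaled by $\alpha$, and invert the transform. The random trigger below stands in for the optimised UAP, and the whole-image DCT ignores any per-block details the paper may use.

```python
import numpy as np
from scipy.fft import dctn, idctn

def inject_trigger(image, trigger, alpha):
    """Add a trigger to the image's DCT coefficients, scaled by alpha
    (which, per the abstract, steers the target IQA score), then invert."""
    coeffs = dctn(image, norm="ortho")
    coeffs += alpha * trigger
    return idctn(coeffs, norm="ortho")

img = np.random.rand(64, 64)
trig = np.random.randn(64, 64) * 0.01   # stand-in for a learned UAP trigger
poisoned = inject_trigger(img, trig, alpha=1.5)
print(float(np.abs(poisoned - img).max()))  # perturbation stays small
```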
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07078">arXiv:2412.07078</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07078">pdf</a>, <a href="https://arxiv.org/format/2412.07078">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Defensive Dual Masking for Robust Adversarial Defense </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wangli Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jie Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yi Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Barthelemy%2C+J">Johan Barthelemy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07078v1-abstract-short" style="display: inline;"> The field of textual adversarial defenses has gained considerable attention in recent years due to the increasing vulnerability of natural language processing (NLP) models to adversarial attacks, which exploit subtle perturbations in input text to deceive models. This paper introduces the Defensive Dual Masking (DDM) algorithm, a novel approach designed to enhance model robustness against such att&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07078v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07078v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07078v1-abstract-full" style="display: none;"> The field of textual adversarial defenses has gained considerable attention in recent years due to the increasing vulnerability of natural language processing (NLP) models to adversarial attacks, which exploit subtle perturbations in input text to deceive models. This paper introduces the Defensive Dual Masking (DDM) algorithm, a novel approach designed to enhance model robustness against such attacks. DDM utilizes a unique adversarial training strategy where [MASK] tokens are strategically inserted into training samples to prepare the model to handle adversarial perturbations more effectively. During inference, potentially adversarial tokens are dynamically replaced with [MASK] tokens to neutralize potential threats while preserving the core semantics of the input. The theoretical foundation of our approach is explored, demonstrating how the selective masking mechanism strengthens the model&#39;s ability to identify and mitigate adversarial manipulations. Our empirical evaluation across a diverse set of benchmark datasets and attack mechanisms consistently shows that DDM outperforms state-of-the-art defense techniques, improving model accuracy and robustness. 

arXiv:2412.05967 [pdf, other] cs.CL cs.AI cs.LG
https://arxiv.org/abs/2412.05967
Title: Language hooks: a modular framework for augmenting LLM reasoning that decouples tool usage from the model and its prompt
Authors: Damien de Mijolla, Wen Yang, Philippa Duckett, Christopher Frye, Mark Worrall
Abstract: Prompting and fine-tuning have emerged as two competing paradigms for augmenting language models with new capabilities, such as the use of tools. Prompting approaches are quick to set up but rely on providing explicit demonstrations of each tool's usage in the model's prompt, thus coupling tool use to the task at hand and limiting generalisation. Fine-tuning removes the need for task-specific demonstrations of tool usage at runtime; however, this ties new capabilities to a single model, thus making already-heavier setup costs a recurring expense. In this paper, we introduce language hooks, a novel framework for augmenting language models with new capabilities that is decoupled both from the model's task-specific prompt and from the model itself. The language hook algorithm interleaves text generation by the base model with the execution of modular programs that trigger conditionally based on the existing text and the available capabilities. Upon triggering, programs may call external tools, auxiliary language models (e.g., using tool-specific prompts), and modify the existing context. We benchmark our method against state-of-the-art baselines, find that it outperforms task-aware approaches, and demonstrate its ability to generalise to novel tasks.
Submitted: 8 December, 2024; originally announced December 2024.
Comments: This work was conducted during Summer 2023. Experimental results and references reflect the state of the field at that time and may not account for subsequent developments.
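
The interleaving loop is the heart of the framework and is easy to mock up. Everything below, the step function, the trigger/program pairs, and the calculator hook, is invented for illustration; only the control flow follows the abstract.

```python
import re

def generate_with_hooks(model_step, hooks, prompt, max_steps=32):
    """Interleave base-model generation with modular programs that fire
    when their trigger matches the text so far and may rewrite the
    context (e.g. by appending a tool result)."""
    text = prompt
    for _ in range(max_steps):
        text += model_step(text)          # one chunk of generation
        for trigger, program in hooks:
            if trigger(text):
                text = program(text)      # tool call, context edit, ...
    return text

# Toy calculator hook: fires whenever the text ends with an unresolved sum.
pat = re.compile(r"(\d+)\+(\d+)=$")
hooks = [(lambda s: pat.search(s) is not None,
          lambda s: s + str(eval(pat.search(s).group(0)[:-1])))]  # toy eval
model_step = lambda s: " 2+3=" if "=" not in s else " done."
print(generate_with_hooks(model_step, hooks, "Compute:", max_steps=2))
# -> Compute: 2+3=5 done.
```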

arXiv:2412.05837 [pdf, other] cs.CV
https://arxiv.org/abs/2412.05837
Title: Tiny Object Detection with Single Point Supervision
Authors: Haoran Zhu, Chang Xu, Ruixiang Zhang, Fang Xu, Wen Yang, Haijian Zhang, Gui-Song Xia
Abstract: Tiny objects, with their limited spatial resolution, often resemble point-like distributions. As a result, bounding box prediction using point-level supervision emerges as a natural and cost-effective alternative to traditional box-level supervision.
However, the small scale and lack of distinctive features of tiny objects make point annotations prone to noise, posing significant hurdles for model robustness. To tackle these challenges, we propose Point Teacher, the first end-to-end point-supervised method for robust tiny object detection in aerial images. To handle label noise from scale ambiguity and location shifts in point annotations, Point Teacher employs a teacher-student architecture and decouples the learning into a two-phase denoising process. In this framework, the teacher network progressively denoises the pseudo boxes derived from noisy point annotations, guiding the student network's learning. Specifically, in the first phase, random masking of image regions facilitates regression learning, enabling the teacher to transform noisy point annotations into coarse pseudo boxes. In the second phase, these coarse pseudo boxes are refined using dynamic multiple instance learning, which adaptively selects the most reliable instance from dynamically constructed proposal bags around the coarse pseudo boxes. Extensive experiments on three tiny object datasets (i.e., AI-TOD-v2, SODA-A, and TinyPerson) validate the proposed method's effectiveness and robustness against point location shifts. Notably, relying solely on point supervision, our Point Teacher already shows performance comparable to box-supervised learning methods. Codes and models will be made publicly available.
Submitted: 8 December, 2024; originally announced December 2024.
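
The second-phase selection step admits a small sketch. The bag construction below (a fixed window around the annotated point) and the detector scores are stand-ins; the paper's bags are built dynamically around the coarse pseudo boxes.

```python
import numpy as np

def refine_pseudo_box(point, proposals, scores):
    """Toy MIL-style refinement: gather proposals whose centers fall near
    the (noisy) point annotation and keep the one the current detector
    scores highest."""
    px, py = point
    cx = (proposals[:, 0] + proposals[:, 2]) / 2
    cy = (proposals[:, 1] + proposals[:, 3]) / 2
    in_bag = (np.abs(cx - px) < 16) & (np.abs(cy - py) < 16)
    idx = np.flatnonzero(in_bag)
    return proposals[idx[np.argmax(scores[idx])]]  # most reliable instance

props = np.array([[0, 0, 10, 10], [28, 28, 40, 40], [30, 30, 38, 38.0]])
print(refine_pseudo_box((33, 33), props, np.array([0.2, 0.6, 0.9])))
# -> [30. 30. 38. 38.]
```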

arXiv:2412.04853 [pdf, other] cs.DB
https://arxiv.org/abs/2412.04853
Title: Budgeted Spatial Data Acquisition: When Coverage and Connectivity Matter
Authors: Wenzhe Yang, Shixun Huang, Sheng Wang, Zhiyong Peng
Abstract: Data is undoubtedly becoming a commodity like oil, land, and labor in the 21st century. Although there have been many successful marketplaces for data trading, existing data marketplaces lack consideration of the case where buyers want to acquire a collection of datasets (instead of one), and the overall spatial coverage and connectivity matter. In this paper, we make the first attempt to formulate this problem as Budgeted Maximum Coverage with Connectivity Constraint (BMCC), which aims to acquire a dataset collection with the maximum spatial coverage under a limited budget while maintaining spatial connectivity. To solve the problem, we propose two approximate algorithms with detailed theoretical guarantees and time complexity analysis, followed by two acceleration strategies to further improve the efficiency of the algorithms. Experiments are conducted on five real-world spatial dataset collections to verify the efficiency and effectiveness of our algorithms.
Submitted: 3 January, 2025; v1 submitted 6 December, 2024; originally announced December 2024.
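
The problem statement suggests an obvious greedy baseline, shown below: repeatedly buy the affordable dataset with the best new-coverage-per-cost ratio among those connected to what is already owned. This bare heuristic is only a reading aid; the paper's two algorithms come with approximation guarantees that it lacks.

```python
def greedy_bmcc(datasets, budget, adjacent):
    """Greedy heuristic for budgeted max coverage with connectivity:
    `datasets` maps name -> (set of covered cells, cost); `adjacent` says
    whether two named datasets are spatially connected."""
    chosen, covered, spent = [], set(), 0
    while True:
        frontier = [(n, cells, cost) for n, (cells, cost) in datasets.items()
                    if n not in chosen and spent + cost <= budget
                    and (not chosen or any(adjacent(n, c) for c in chosen))]
        gains = [(len(cells - covered) / cost, n, cells, cost)
                 for n, cells, cost in frontier if cells - covered]
        if not gains:
            return chosen, covered
        _, n, cells, cost = max(gains)      # best marginal coverage per cost
        chosen.append(n); covered |= cells; spent += cost

# Toy instance: cells covered by each dataset; adjacency = shared cells.
data = {"A": ({1, 2}, 1), "B": ({2, 3, 4}, 2), "C": ({9}, 1)}
adj = lambda x, y: bool(data[x][0] & data[y][0])
print(greedy_bmcc(data, budget=3, adjacent=adj))  # (['A', 'B'], {1, 2, 3, 4})
```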
