<!-- Stray capture artifacts ("CINXE.COM", "LAPA") preserved in this comment so the document starts with the doctype. -->
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- charset must come first in <head> (within the first 1024 bytes) -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LAPA</title>
  <meta name="description" content="LAPA: Latent Action Pretraining from Videos">
  <meta name="keywords" content="Generalization, Manipulation">

  <!-- Google tag (gtag.js) -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-FNGN5K40RL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', 'G-FNGN5K40RL');
  </script>

  <link rel="icon" href="./static/images/lapa_final.ico" type="image/x-icon">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>

  <style>
    /* 3-column grid for the rollout video gallery */
    .video-grid {
      display: grid;
      grid-template-columns: repeat(3, 1fr); /* creates a 3-column grid */
      gap: 10px;                             /* space between the videos */
      max-width: 1000px;                     /* adjust based on preference */
      margin: auto;                          /* center the grid horizontally */
    }
    .video-grid video {
      width: 100%;          /* video fills its cell */
      aspect-ratio: 16 / 9; /* keeps the aspect ratio of videos */
    }
    .video-section-header {
      text-align: center;
      margin-top: 20px;
      margin-bottom: 20px;
    }
    #perturbation-task-video {
      width: 100%;
      height: auto;
    }
  </style>

  <script>
    // Installs a fallback on the perturbation demo video: if the selected
    // task/perturbation clip fails to load, retry with the task's
    // "-undefined" clip; if that one is already loaded (src contains
    // "undefined"), give up.
    function init() {
      const video = document.getElementById("perturbation-task-video");
      video.addEventListener("error", () => {
        // BUGFIX: was `console.log("…: ", video.src), ". Setting…";` —
        // the trailing string was a discarded comma-operator operand
        // and never reached the log.
        console.log("Error loading video: " + video.src + ". Setting default to none");
        if (video.src.includes("undefined")) {
          console.log("Don't have an undefined version of the clip, just crash");
          return;
        }
        const task = document.getElementById("single-menu-tasks").value;
        const uri = "static/videos/perturbations/" + task + "-undefined.mp4";
        video.src = uri;
        video.playbackRate = 1.75;
        video.play();
      }, true); // capture phase, as in the original
    }

    // Loads the clip matching the currently selected task + perturbation
    // from the two dropdown menus.
    function updateSingleVideo() {
      const task = document.getElementById("single-menu-tasks").value;
      const perturbation = document.getElementById("single-menu-perturbations").value;
      const video = document.getElementById("perturbation-task-video");
      const uri = "static/videos/perturbations/" + task + "-" + perturbation + ".mp4";
      video.src = uri;
      video.playbackRate = 1.75;
      video.play();
    }

    // Replaces the former inline <body onload="init(); updateSingleVideo();">
    // handler (same event, same order, CSP-friendly).
    window.addEventListener("load", () => {
      init();
      updateSingleVideo();
    });
  </script>
</head>
<body>
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">LAPA: Latent Action Pretraining from Videos</h1>
            <div class="is-size-5 publication-authors">
              <span class="author-block"><a target="_blank" href="https://seonghyeonye.github.io/">Seonghyeon Ye</a><sup>*</sup><sup>1</sup>,</span>
              <span class="author-block"><a target="_blank" href="https://joeljang.github.io/">Joel Jang</a><sup>*</sup><sup>2</sup>,</span><br>
              <span class="author-block"><a target="_blank" href="https://scholar.google.com/citations?user=_Kw32VoAAAAJ&hl=ko">Byeongguk Jeon</a><sup>1</sup>,</span>
              <span class="author-block"><a target="_blank" href="https://scholar.google.co.kr/citations?user=xii168wAAAAJ&hl">Sejune Joo</a><sup>1</sup>,</span>
              <span class="author-block"><a target="_blank" href="https://jwyang.github.io/">Jianwei Yang</a><sup>3</sup>,</span>
<span class="author-block"> <a target="_blank" href="https://scholar.google.com/citations?user=u1CNjgwAAAAJ&hl">Baolin Peng</a><sup>3</sup>, </span> <span class="author-block"> <a target="_blank" href="https://ai.stanford.edu/~amandlek/">Ajay Mandlekar</a><sup>4</sup>, </span><br> <span class="author-block"> <a target="_blank" href="https://cs-people.bu.edu/rxtan/">Reuben Tan</a><sup>3</sup>, </span> <span class="author-block"> <a target="_blank" href="https://research.nvidia.com/person/yu-wei-chao">Yu-Wei Chao</a><sup>4</sup>, </span> <span class="author-block"> <a target="_blank" href="https://yuchenlin.xyz/">Yuchen Lin</a><sup>5</sup>, </span> <span class="author-block"> <a target="_blank" href="https://scholar.google.com/citations?user=r4KX3UgAAAAJ&hl">Lars Liden</a><sup>3</sup>, </span><br> <span class="author-block"> <a target="_blank" href="https://sites.google.com/view/kiminlee">Kimin Lee</a><sup>1</sup><sup>†</sup>, </span> <span class="author-block"> <a target="_blank" href="https://www.microsoft.com/en-us/research/people/jfgao/?from=https://research.microsoft.com/en-us/um/people/jfgao/&type=exact">Jianfeng Gao</a><sup>3</sup><sup>†</sup>, </span> <span class="author-block"> <a target="_blank" href="https://www.cs.washington.edu/people/faculty/lsz">Luke Zettlemoyer</a><sup>2</sup><sup>†</sup>, </span> <span class="author-block"> <a target="_blank" href="https://homes.cs.washington.edu/~fox/">Dieter Fox</a><sup>2,4</sup><sup>†</sup>, </span> <span class="author-block"> <a target="_blank" href="https://seominjoon.github.io/">Minjoon Seo</a><sup>1</sup><sup>†</sup> </span> </div> <div class="is-size-5 publication-authors"> <span class="author-block"><sup>1</sup>KAIST</span> <span class="author-block"><sup>2</sup>University of Washington</span><br> <span class="author-block"><sup>3</sup>Microsoft Research</span> <span class="author-block"><sup>4</sup>NVIDIA</span> <span class="author-block"><sup>5</sup>Allen Institute for AI</span> </div> <div 
class="footnote"> <p>* Equal contribution, † Equal advising</p> </div> <div class="column has-text-centered"> <!-- ArXiv link --> <span class="link-block"> <a target="_blank" href="https://arxiv.org/abs/2410.11758" class="external-link button is-normal is-rounded is-dark"> <span class="icon"><i class="fas fa-file"></i></span> <span>ArXiv</span> </a> </span> <!-- Code Link. --> <span class="link-block"> <a target="_blank" href="https://github.com/LatentActionPretraining/LAPA" class="external-link button is-normal is-rounded is-dark"> <span class="icon"><i class="fab fa-github"></i></span> <span>Code</span> </a> </span> <span class="link-block"> <a href="https://huggingface.co/latent-action-pretraining/LAPA-7B-openx" class="external-link button is-normal is-rounded is-dark"> <span class="icon"> <img src="static/images/hf_icon.svg" /> </span> <span>Model</span> </a> </span> </div> </div> </div> </div> </div> </section> <!-- Teaser and Abstract --> <section class="section"> <div class="container is-max-desktop"> <!-- Video Teaser --> <video id="teaser" autoplay muted loop height="75"> <source src="static/videos/final_final_teaser.mp4" type="video/mp4"> </video> <!-- /Video Teaser --> <br/> <br/> <!-- Abstract --> <div class="columns is-centered has-text-centered"> <div class="column is-four-fifths"> <h2 class="title is-3">Abstract</h2> <div class="content has-text-justified"> <p> We introduce <b>L</b>atent <b>A</b>ction <b>P</b>retraining for general <b>A</b>ction models (LAPA), the first unsupervised method for pretraining Vision-Language-Action (VLA) models without ground-truth robot action labels. Existing Vision-Language-Action models require action labels typically collected by human teleoperators during pretraining, which significantly limits possible data sources and scale. In this work, we propose a method to learn from internet-scale videos that do not have robot action labels. 
We first train an action quantization model leveraging a VQ-VAE-based objective to learn discrete latent actions between image frames, then pretrain a latent VLA model to predict these latent actions from observations and task descriptions, and finally finetune the VLA on small-scale robot manipulation data to map from latent to robot actions. Experimental results demonstrate that our method significantly outperforms existing techniques that train robot manipulation policies from large-scale videos. Furthermore, it outperforms the state-of-the-art VLA model trained with robotic action labels on real-world manipulation tasks that require language conditioning, generalization to unseen objects, and semantic generalization to unseen instructions. Training only on human manipulation videos also shows positive transfer, opening up the potential for leveraging web-scale data for robotics foundation models. </p> </div> </div> </div> <!-- /Abstract --> <div class="columns is-centered has-text-centered"> <div class="column is-four-fifths"> <h2 class="title is-3">Overview of LAPA</h2> <div style="text-align: center;"> <img src="static/images/method.png" alt="Overview of the two-stage LAPA method" style="width: 100%; height: auto;"> </div> <div class="content has-text-justified"> <p> LAPA is divided into two stages: Latent Action Quantization and Latent Pretraining. First, we use a VQ-VAE based objective to capture the discretized latent delta information between consecutive frames in a video. Next, a pretrained VLM is trained to predict the latent action designated by the encoder of the Latent Action Quantization model, given the current image and the language instruction. After Latent Pretraining, we finetune the VLA model on a small number of ground-truth action-labeled trajectories to map the latent space to the actual action space.
</p> </div> </div> </div> <div class="columns is-centered has-text-centered"> <div class="column is-four-fifths"> <h2 class="title is-3">Experiments</h2> <div class="content has-text-justified"> <h2 class="title is-4">Real-Robot Experiments</h2></div> <div style="text-align: center;"> <img src="static/images/real_robot.png" alt="Real-robot experiment results" style="width: 100%; height: auto;"> </div> <div class="content has-text-justified"> <h2 class="title is-5">Cross-Embodiment</h2> <p> For the cross-embodiment setting, we pretrain the VLAs on the WidowX embodiment (Bridgev2) and fine-tune them on the data collected with the Franka robot. By comparing LAPA (Bridge), which does not leverage action-labeled trajectories during pretraining, with models that use action-labeled trajectories during pretraining, we observe an interesting finding: <b>LAPA, which is pretrained without ground-truth action labels, outperforms VLAs that use action-labeled pretraining data (ActionVLA (Bridge) and OpenVLA (Bridge)) on the average success rate of the 3 tasks.</b> We hypothesize that VLA models pretrained on ground-truth action labels have overfitted to the WidowX action space from the Bridgev2 dataset, hampering cross-embodiment adaptability to action distribution shifts during fine-tuning. </p> </div> <div class="content has-text-justified"> <h2 class="title is-5">Multi-Embodiment</h2> <p> For the multi-embodiment setting, we pretrain the VLAs on Open-X Embodiment, which consists of robot trajectories of multiple embodiments. <b>When comparing LAPA (Open-X) with OpenVLA (Open-X), we see that LAPA significantly outperforms OpenVLA on 2 out of 3 tasks.</b> This highlights LAPA's effectiveness in a multi-embodiment setting by showcasing its ability to leverage a shared latent action space during pretraining, akin to how language and image representations are learned in an unsupervised manner.
In contrast, contemporary action pretraining methods may suffer from reduced positive transfer between datasets due to the variability in action representation spaces across different embodiments and datasets. </p> </div> <div class="content has-text-justified"> <h2 class="title is-4">Learning from Human Manipulation Videos</h2></div> <div style="text-align: center;"> <img src="static/images/sthv2.png" alt="Description of the image" style="width: 90%; height: auto;"> </div> <div class="content has-text-justified"> <p> To extend LAPA on human manipulation videos where the action labels are not present, we pretrain LAPA on Something-Something V2 Dataset (220K videos) and fine-tune on robot embodiment. The embodiment gap for this case is extreme (human to robot). <b>Surprisingly, we can see that LAPA trained with human videos outperforms OpenVLA (Bridge) on average.</b> Despite the larger embodiment gap for LAPA (Human to robot vs. Robot to robot), it learns a better prior for robot manipulation. This result highlights the potential of raw human manipulation videos from the web compared to expensive robot manipulation data, which requires time-intensive teleoperation to collect. We expect that applying our approach on large-scale internet videos (e.g., YouTube videos) could unlock the potential for large-scale pretraining of a generalist action foundational model, similar to foundational models in Natural Language Processing or Computer Vision. </p> </div> </div> </div> <div class="columns is-centered has-text-centered"> <div class="column is-four-fifths"> <h2 class="title is-3">Analyzing Latent Actions</h2> <div class="content has-text-justified"> <p> For interpretation, we condition the current image observation and each latent action on the decoder of the latent action quantization model, and present the reconstructed images. 
</p> </div> <div style="text-align: center;"> <img src="static/images/latent_analysis.png" alt="Reconstructed images for each latent action on robot data" style="width: 90%; height: auto;"> </div> <div class="content has-text-justified"> <p> We observe that each latent action can be mapped into a semantic action of the robot arm. For example, latent action 0 corresponds to moving a bit left and forward. </p> </div> <div style="text-align: center;"> <img src="static/images/sthv2_camera.png" alt="Reconstructed images for latent actions on human videos, including camera motion" style="width: 90%; height: auto;"> </div> <div class="content has-text-justified"> <p> For human videos where the camera view changes in a single video, we observe that each latent action can be mapped into a semantic action including camera movements. For example, latent action [3,5,2,7] corresponds to moving the camera a bit down while [4,2,0,0] corresponds to moving the camera slightly up. </p> </div> <div style="text-align: center;"> <img src="static/images/latent_analysis_OpenX.png" alt="Reconstructed images for latent actions across Open-X embodiments" style="width: 90%; height: auto;"> </div> <div class="content has-text-justified"> <p> For the multi-embodiment setting, we observe that each latent action can be mapped into a similar semantic action even though the embodiments are different.
<b>This supports our previous claim that latent actions are learned in a shared representation space, regardless of the embodiment or dataset, facilitating stronger positive transfer across diverse datasets.</b> </p> </div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <!-- Left GIF with label --> <div style="text-align: center;"> <img src="static/images/rollout.gif" alt="Rollout GIF" style="width: 65%; height: auto;"> <br><b>Generated Rollout from LAPA</b> </div> <!-- Right GIF with label --> <div style="text-align: center;"> <img src="static/images/rollout_gt.gif" alt="Rollout Ground Truth GIF" style="width: 65%; height: auto;"> <br><b>Ground Truth Trajectory</b> </div> </div> <div class="content has-text-justified"> <p> We analyze the coarse-grained planning capability of LAPA through a closed-loop rollout by using LAPA model that has only undergone pretraining. When conditioned on the current observation and the instruction to "take the broccoli out of the pot", LAPA generates robot trajectories that successfully reaches for the broccoli, moves down to grab it, and, as the arm moves away from the pot, the broccoli disappears. This shows the potential for LAPA as a general-purpose robotic world model, not only predicting actions but also the outcomes of the actions. 
</p> </div> </div> </div> <div class="columns is-centered has-text-centered"> <div class="column is-four-fifths"> <h2 class="title is-3">Rollout Videos</h2> <div class="content has-text-justified"> <h2 class="title is-4">Seen Objects, Unseen Combinations</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <!-- Left GIF with label --> <div style="text-align: center;"> <b>Knock mustard down</b> <img src="static/videos/real_videos/knocking_seen_mustard_scratch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <!-- Right GIF with label --> <div style="text-align: center;"> <b>Knock mustard down</b> <img src="static/videos/real_videos/knocking_seen_mustard_openvla_fail.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Knock mustard down</b> <img src="static/videos/real_videos/knocking_seen_mustard_lapa_success.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">✅</span> </div> </div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <!-- Left GIF with label --> <div style="text-align: center;"> <b>Pick orange block, put in sink</b> <img src="static/videos/real_videos/pick_seen_orange_scratch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <!-- Right GIF with label --> <div style="text-align: center;"> <b>Pick orange block, put in sink</b> <img src="static/videos/real_videos/pick_seen_orange_openvla_success.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">✅</span> </div> <div style="text-align: center;"> <b>Pick orange block, put in sink</b> <img 
src="static/videos/real_videos/pick_seen_orange_lapa_success.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">✅</span> </div> </div> <div class="content has-text-justified"> <h2 class="title is-4">Unseen Objects</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <!-- Left GIF with label --> <div style="text-align: center;"> <b>Knock pringles down</b> <img src="static/videos/real_videos/knocking_unseen_pringles_scatch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <!-- Right GIF with label --> <div style="text-align: center;"> <b>Knock pringles down</b> <img src="static/videos/real_videos/knocking_unseen_pringles_openvla_half.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">⚠️</span> </div> <div style="text-align: center;"> <b>Knock pringles down</b> <img src="static/videos/real_videos/knocking_unseen_pringles_openvla_half.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">⚠️</span> </div> </div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Cover donut with towel</b> <img src="static/videos/real_videos/covering_unseen_donut_scratch_half.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">⚠️</span> </div> <div style="text-align: center;"> <b>Cover donut with towel</b> <img src="static/videos/real_videos/covering_unseen_donut_openvla_fail.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Cover donut with towel</b> <img src="static/videos/real_videos/covering_unseen_donut_lapa_success.gif" alt="LAPA" 
style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">✅</span> </div> </div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Pick paprika, put in sink</b> <img src="static/videos/real_videos/pick_unseen_yellow_scratch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Pick paprika, put in sink</b> <img src="static/videos/real_videos/pick_unseen_yellow_openvla_success.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">✅</span> </div> <div style="text-align: center;"> <b>Pick paprika, put in sink</b> <img src="static/videos/real_videos/pick_unseen_yellow_lapa_success.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">✅</span> </div> </div> <div class="content has-text-justified"> <h2 class="title is-4">Unseen Instructions</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <!-- Left GIF with label --> <div style="text-align: center;"> <b>Knock an object for cleaning</b> <img src="static/videos/real_videos/knocking_semantic_clean_scratch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <!-- Right GIF with label --> <div style="text-align: center;"> <b>Knock an object for cleaning</b> <img src="static/videos/real_videos/knocking_semantic_clean_openvla_fail.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Knock an object for cleaning</b> <img src="static/videos/real_videos/knocking_semantic_clean_lapa_sucess.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> 
<br><span style="font-size: 24px;">✅</span> </div> </div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 40px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Cover a yellow object with towel</b> <img src="static/videos/real_videos/covering_semantic_yellow_scratch_fail.gif" alt="Scratch" style="width: 65%; height: auto;"> <br><b>Scratch</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Cover a yellow object with towel</b> <img src="static/videos/real_videos/covering_semantic_yellow_openvla_half.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">⚠️</span> </div> <div style="text-align: center;"> <b>Cover a yellow object with towel</b> <img src="static/videos/real_videos/covering_semantic_yellow_lapa_half.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">⚠️</span> </div> </div> <div class="content has-text-justified"> <h2 class="title is-4">Bi-Manual</h2> <!-- <p>Both OpenVLA and LAPA struggles on Bi-manual robot setup, indicating much room for improvement.</p> --> </div> <div class="content has-text-justified"> <h2 class="title is-5">Unseen Object Combinations</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 0px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Put gray plate on container and peach on plate</b> <img src="static/videos/real_videos/bimanual_seen_openvla.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Put gray plate on container and peach on plate</b> <img src="static/videos/real_videos/bimanual_seen_lapa.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">⚠️</span> </div> </div> <div class="content has-text-justified"> <h2 class="title is-5">Unseen 
Objects</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 0px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Put white plate on container and soup on plate</b> <img src="static/videos/real_videos/bimanual_unseen_openvla.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Put white plate on container and soup on plate</b> <img src="static/videos/real_videos/bimanual_unseen_lapa.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">⚠️</span> </div> </div> <div class="content has-text-justified"> <h2 class="title is-5">Unseen Instructions</h2></div> <div style="display: flex; justify-content: center; gap: 20px; margin-top: 0px; margin-bottom: 20px;"> <div style="text-align: center;"> <b>Put darker plate on container and round object</b> <img src="static/videos/real_videos/bimanual_semantic_openvla.gif" alt="OpenVLA" style="width: 65%; height: auto;"> <br><b>OpenVLA</b> <br><span style="font-size: 24px;">❌</span> </div> <div style="text-align: center;"> <b>Put darker plate on container and round object</b> <img src="static/videos/real_videos/bimanual_semantic_lapa.gif" alt="LAPA" style="width: 65%; height: auto;"> <br><b>LAPA</b> <br><span style="font-size: 24px;">⚠️</span> </div> </div> <p>Both OpenVLA and LAPA struggle on the bi-manual robot setup, indicating much room for improvement.</p> </div> </div> </div> </section> <!-- BibTeX --> <section class="section" id="BibTeX"> <div class="container is-max-widescreen content"> <h2 class="title">BibTeX</h2> <pre><code>@misc{ye2024latentactionpretrainingvideos, title={Latent Action Pretraining from Videos}, author={Seonghyeon Ye and Joel Jang and Byeongguk Jeon and Sejune Joo and Jianwei Yang and Baolin Peng and Ajay Mandlekar and Reuben Tan and Yu-Wei Chao and Bill Yuchen Lin and Lars Liden and Kimin Lee and Jianfeng Gao 
and Luke Zettlemoyer and Dieter Fox and Minjoon Seo}, year={2024}, eprint={2410.11758}, archivePrefix={arXiv}, primaryClass={cs.RO}, url={https://arxiv.org/abs/2410.11758}, }</code></pre> </div> </section> <!-- /BibTeX --> <!-- Footer --> <footer class="footer"> <div class="container"> <div class="columns is-centered"> <div class="column"> <div class="content has-text-centered"> <p> Website template borrowed from <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a> made by the amazing <a href="https://keunhong.com/">Keunhong Park</a>. </p> </div> </div> </div> </div> </footer> <!-- /Footer --> </body> </html>