CINXE.COM
What is Video-Text-to-Text? - Hugging Face
<!doctype html> <html class=""> <head> <meta charset="utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no" /> <meta name="description" content="Learn about Video-Text-to-Text using Machine Learning" /> <meta property="fb:app_id" content="1321688464574422" /> <meta name="twitter:card" content="summary_large_image" /> <meta name="twitter:site" content="@huggingface" /> <meta name="twitter:image" content="https://huggingface.co/front/thumbnails/v2-2.png" /> <meta property="og:title" content="What is Video-Text-to-Text? - Hugging Face" /> <meta property="og:type" content="website" /> <meta property="og:url" content="https://huggingface.co/tasks/video-text-to-text" /> <meta property="og:image" content="https://huggingface.co/front/thumbnails/v2-2.png" /> <link rel="stylesheet" href="/front/build/kube-6fe56a2/style.css" /> <link rel="preconnect" href="https://fonts.gstatic.com" /> <link href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200;0,300;0,400;0,600;0,700;0,900;1,200;1,300;1,400;1,600;1,700;1,900&display=swap" rel="stylesheet" /> <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600;700&display=swap" rel="stylesheet" /> <link rel="preload" href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.12.0/katex.min.css" as="style" onload="this.onload=null;this.rel='stylesheet'" /> <noscript> <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.12.0/katex.min.css" /> </noscript> <link rel="canonical" href="https://huggingface.co/tasks/video-text-to-text"> <script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "FAQPage", "mainEntity": [ { "@type": "Question", "name": "What is Video-Text-to-Text?", "acceptedAnswer": { "@type": "Answer", "text": "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models." } }, { "@type": "Question", "name": "What libraries can I use for Video-Text-to-Text?", "acceptedAnswer": { "@type": "Answer", "text": "The and transformers library is compatible with Video-Text-to-Text." } }, { "@type": "Question", "name": "What models can I use for Video-Text-to-Text?", "acceptedAnswer": { "@type": "Answer", "text": "The llava-hf\/llava-onevision-qwen2-72b-ov-hfand llava-hf\/LLaVA-NeXT-Video-34B-hf models can be used for Video-Text-to-Text." } }, { "@type": "Question", "name": "What datasets can I use for Video-Text-to-Text?", "acceptedAnswer": { "@type": "Answer", "text": "The lmms-lab\/Video-MME, lmms-lab\/VideoChatGPT, and HuggingFaceFV\/finevideo datasets can be used for Video-Text-to-Text." } } ] }</script> <title>What is Video-Text-to-Text? - Hugging Face</title> <script defer data-domain="huggingface.co" event-loggedIn="false" src="/js/script.pageview-props.js" ></script> <script> window.plausible = window.plausible || function () { (window.plausible.q = window.plausible.q || []).push(arguments); }; </script> <script> window.hubConfig = {"features":{"signupDisabled":false},"sshGitUrl":"git@hf.co","moonHttpUrl":"https:\/\/huggingface.co","captchaApiKey":"bd5f2066-93dc-4bdd-a64b-a24646ca3859","captchaDisabledOnSignup":true,"datasetViewerPublicUrl":"https:\/\/datasets-server.huggingface.co","stripePublicKey":"pk_live_x2tdjFXBCvXo2FFmMybezpeM00J6gPCAAc","environment":"production","userAgent":"HuggingFace (production)","spacesIframeDomain":"hf.space","spacesApiUrl":"https:\/\/api.hf.space","docSearchKey":"ece5e02e57300e17d152c08056145326e90c4bff3dd07d7d1ae40cf1c8d39cb6","logoDev":{"apiUrl":"https:\/\/img.logo.dev\/","apiKey":"pk_UHS2HZOeRnaSOdDp7jbd5w"}}; </script> <script type="text/javascript" src="https://de5282c3ca0c.edge.sdk.awswaf.com/de5282c3ca0c/526cf06acb0d/challenge.js" defer></script> </head> <body class="flex flex-col min-h-dvh bg-white dark:bg-gray-950 text-black TaskPage"> <div class="flex min-h-dvh flex-col"> <div class="SVELTE_HYDRATER contents" data-target="MainHeader" data-props="{"classNames":"","isWide":false,"isZh":false,"isPro":false}"><header class="border-b border-gray-100 "><div class="w-full px-4 container flex h-16 items-center"><div class="flex flex-1 items-center"><a class="mr-5 flex flex-none items-center lg:mr-6" href="/"><img alt="Hugging Face's logo" class="w-7 md:mr-2" src="/front/assets/huggingface_logo-noborder.svg"> <span class="hidden whitespace-nowrap text-lg font-bold md:block">Hugging Face</span></a> <div class="relative flex-1 lg:max-w-sm mr-2 sm:mr-4 md:mr-3 xl:mr-6"><input autocomplete="off" class="w-full dark:bg-gray-950 pl-8 form-input-alt h-9 pr-3 focus:shadow-xl " name="" placeholder="Search models, datasets, users..." spellcheck="false" type="text" value=""> <svg class="absolute left-2.5 text-gray-400 top-1/2 transform -translate-y-1/2" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M30 28.59L22.45 21A11 11 0 1 0 21 22.45L28.59 30zM5 14a9 9 0 1 1 9 9a9 9 0 0 1-9-9z" fill="currentColor"></path></svg> </div> <div class="flex flex-none items-center justify-center p-0.5 place-self-stretch lg:hidden"><button class="relative z-40 flex h-6 w-8 items-center justify-center" type="button"><svg width="1em" height="1em" viewBox="0 0 10 10" class="text-xl" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" preserveAspectRatio="xMidYMid meet" fill="currentColor"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.65039 2.9999C1.65039 2.8066 1.80709 2.6499 2.00039 2.6499H8.00039C8.19369 2.6499 8.35039 2.8066 8.35039 2.9999C8.35039 3.1932 8.19369 3.3499 8.00039 3.3499H2.00039C1.80709 3.3499 1.65039 3.1932 1.65039 2.9999ZM1.65039 4.9999C1.65039 4.8066 1.80709 4.6499 2.00039 4.6499H8.00039C8.19369 4.6499 8.35039 4.8066 8.35039 4.9999C8.35039 5.1932 8.19369 5.3499 8.00039 5.3499H2.00039C1.80709 5.3499 1.65039 5.1932 1.65039 4.9999ZM2.00039 6.6499C1.80709 6.6499 1.65039 6.8066 1.65039 6.9999C1.65039 7.1932 1.80709 7.3499 2.00039 7.3499H8.00039C8.19369 7.3499 8.35039 7.1932 8.35039 6.9999C8.35039 6.8066 8.19369 6.6499 8.00039 6.6499H2.00039Z"></path></svg> </button> </div></div> <nav aria-label="Main" class="ml-auto hidden lg:block"><ul class="flex items-center space-x-1.5 2xl:space-x-2"><li class="hover:text-indigo-700"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/models"><svg class="mr-1.5 text-gray-400 group-hover:text-indigo-500" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg> Models</a> </li><li class="hover:text-red-700"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/datasets"><svg class="mr-1.5 text-gray-400 group-hover:text-red-500" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 25 25"><ellipse cx="12.5" cy="5" fill="currentColor" fill-opacity="0.25" rx="7.5" ry="2"></ellipse><path d="M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z" fill="currentColor" opacity="0.5"></path><path d="M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z" fill="currentColor" opacity="0.5"></path><path d="M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z" fill="currentColor"></path></svg> Datasets</a> </li><li class="hover:text-blue-700"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/spaces"><svg class="mr-1.5 text-gray-400 group-hover:text-blue-500" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 25 25"><path opacity=".5" d="M6.016 14.674v4.31h4.31v-4.31h-4.31ZM14.674 14.674v4.31h4.31v-4.31h-4.31ZM6.016 6.016v4.31h4.31v-4.31h-4.31Z" fill="currentColor"></path><path opacity=".75" fill-rule="evenodd" clip-rule="evenodd" d="M3 4.914C3 3.857 3.857 3 4.914 3h6.514c.884 0 1.628.6 1.848 1.414a5.171 5.171 0 0 1 7.31 7.31c.815.22 1.414.964 1.414 1.848v6.514A1.914 1.914 0 0 1 20.086 22H4.914A1.914 1.914 0 0 1 3 20.086V4.914Zm3.016 1.102v4.31h4.31v-4.31h-4.31Zm0 12.968v-4.31h4.31v4.31h-4.31Zm8.658 0v-4.31h4.31v4.31h-4.31Zm0-10.813a2.155 2.155 0 1 1 4.31 0 2.155 2.155 0 0 1-4.31 0Z" fill="currentColor"></path><path opacity=".25" d="M16.829 6.016a2.155 2.155 0 1 0 0 4.31 2.155 2.155 0 0 0 0-4.31Z" fill="currentColor"></path></svg> Spaces</a> </li><li class="hover:text-yellow-700 max-xl:hidden"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/posts"><svg class="mr-1.5 text-gray-400 group-hover:text-yellow-500 !text-yellow-500" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 12 12" preserveAspectRatio="xMidYMid meet"><path fill="currentColor" fill-rule="evenodd" d="M3.73 2.4A4.25 4.25 0 1 1 6 10.26H2.17l-.13-.02a.43.43 0 0 1-.3-.43l.01-.06a.43.43 0 0 1 .12-.22l.84-.84A4.26 4.26 0 0 1 3.73 2.4Z" clip-rule="evenodd"></path></svg> Posts</a> </li><li class="hover:text-yellow-700"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/docs"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="mr-1.5 text-gray-400 group-hover:text-yellow-500" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path opacity="0.5" d="M20.9022 5.10334L10.8012 10.8791L7.76318 9.11193C8.07741 8.56791 8.5256 8.11332 9.06512 7.7914L15.9336 3.73907C17.0868 3.08811 18.5002 3.26422 19.6534 3.91519L19.3859 3.73911C19.9253 4.06087 20.5879 4.56025 20.9022 5.10334Z" fill="currentColor"></path><path d="M10.7999 10.8792V28.5483C10.2136 28.5475 9.63494 28.4139 9.10745 28.1578C8.5429 27.8312 8.074 27.3621 7.74761 26.7975C7.42122 26.2327 7.24878 25.5923 7.24756 24.9402V10.9908C7.25062 10.3319 7.42358 9.68487 7.74973 9.1123L10.7999 10.8792Z" fill="currentColor" fill-opacity="0.75"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M21.3368 10.8499V6.918C21.3331 6.25959 21.16 5.61234 20.8346 5.03949L10.7971 10.8727L10.8046 10.874L21.3368 10.8499Z" fill="currentColor"></path><path opacity="0.5" d="M21.7937 10.8488L10.7825 10.8741V28.5486L21.7937 28.5234C23.3344 28.5234 24.5835 27.2743 24.5835 25.7335V13.6387C24.5835 12.0979 23.4365 11.1233 21.7937 10.8488Z" fill="currentColor"></path></svg> Docs</a> </li><li class="hover:text-green-700"><a class="group flex items-center px-2 py-0.5 dark:hover:text-gray-400" href="/enterprise"><svg class="mr-1.5 text-gray-400 group-hover:text-green-500" xmlns="http://www.w3.org/2000/svg" fill="none" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 33 27"><path fill="currentColor" fill-rule="evenodd" d="M13.5.7a8.7 8.7 0 0 0-7.7 5.7L1 20.6c-1 3.1.9 5.7 4.1 5.7h15c3.3 0 6.8-2.6 7.8-5.7l4.6-14.2c1-3.1-.8-5.7-4-5.7h-15Zm1.1 5.7L9.8 20.3h9.8l1-3.1h-5.8l.8-2.5h4.8l1.1-3h-4.8l.8-2.3H23l1-3h-9.5Z" clip-rule="evenodd"></path></svg> Enterprise</a> </li> <li><a class="group flex items-center px-2 py-0.5 hover:text-gray-500 dark:hover:text-gray-400" href="/pricing">Pricing </a></li> <li><div class="relative group"> <button class="px-2 py-0.5 hover:text-gray-500 dark:hover:text-gray-600 flex items-center " type="button"> <svg class=" text-gray-500 w-5 group-hover:text-gray-400 dark:text-gray-300 dark:group-hover:text-gray-400" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 18" preserveAspectRatio="xMidYMid meet"><path fill-rule="evenodd" clip-rule="evenodd" d="M14.4504 3.30221C14.4504 2.836 14.8284 2.45807 15.2946 2.45807H28.4933C28.9595 2.45807 29.3374 2.836 29.3374 3.30221C29.3374 3.76842 28.9595 4.14635 28.4933 4.14635H15.2946C14.8284 4.14635 14.4504 3.76842 14.4504 3.30221Z" fill="currentColor"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M14.4504 9.00002C14.4504 8.53382 14.8284 8.15588 15.2946 8.15588H28.4933C28.9595 8.15588 29.3374 8.53382 29.3374 9.00002C29.3374 9.46623 28.9595 9.84417 28.4933 9.84417H15.2946C14.8284 9.84417 14.4504 9.46623 14.4504 9.00002Z" fill="currentColor"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M14.4504 14.6978C14.4504 14.2316 14.8284 13.8537 15.2946 13.8537H28.4933C28.9595 13.8537 29.3374 14.2316 29.3374 14.6978C29.3374 15.164 28.9595 15.542 28.4933 15.542H15.2946C14.8284 15.542 14.4504 15.164 14.4504 14.6978Z" fill="currentColor"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M1.94549 6.87377C2.27514 6.54411 2.80962 6.54411 3.13928 6.87377L6.23458 9.96907L9.32988 6.87377C9.65954 6.54411 10.194 6.54411 10.5237 6.87377C10.8533 7.20343 10.8533 7.73791 10.5237 8.06756L6.23458 12.3567L1.94549 8.06756C1.61583 7.73791 1.61583 7.20343 1.94549 6.87377Z" fill="currentColor"></path></svg> </button> </div></li> <li><hr class="h-5 w-0.5 border-none bg-gray-100 dark:bg-gray-800"></li> <li><a class="block cursor-pointer whitespace-nowrap px-2 py-0.5 hover:text-gray-500 dark:hover:text-gray-400" href="/login">Log In </a></li> <li><a class="whitespace-nowrap rounded-full border border-transparent bg-gray-900 px-3 py-1 leading-none text-white hover:border-black hover:bg-white hover:text-black" href="/join">Sign Up </a></li></ul></nav></div></header></div> <div class="SVELTE_HYDRATER contents" data-target="SSOBanner" data-props="{}"></div> <main class="flex flex-1 flex-col"><div class="container relative grid w-full grid-cols-1 gap-8 space-y-4 pb-32 lg:grid-cols-12"><div class="flex-none space-y-12 pt-6 md:pt-12 lg:col-span-7 2xl:pr-16"><section class="max-w-2xl"><a href="/tasks" class="mb-1 flex items-center text-gray-400 hover:underline"><svg class="mr-1.5 text-sm text-gray-300" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M10 16L20 6l1.4 1.4l-8.6 8.6l8.6 8.6L20 26z" fill="currentColor"></path></svg> Tasks</a> <div class="mb-8 flex items-center"><svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 32 32" aria-hidden="true" class="text-3xl mr-2.5 text-blue-400"><path fill="currentColor" d="m30 19l-4 3.2V20a2 2 0 0 0-2-2h-8a2 2 0 0 0-2 2v6a2 2 0 0 0 2 2h8a2 2 0 0 0 2-2v-2.2l4 3.2Zm-14 7v-6h8l.002 6Z"></path><path fill="currentColor" d="M12 28H8V4h8v6a2.006 2.006 0 0 0 2 2h6v3h2v-5a.91.91 0 0 0-.3-.7l-7-7A.9.9 0 0 0 18 2H8a2.006 2.006 0 0 0-2 2v24a2.006 2.006 0 0 0 2 2h4Zm6-23.6l5.6 5.6H18Z"></path></svg> <h1 class="text-3xl font-bold xl:text-4xl">Video-Text-to-Text</h1></div> <p class="text-[1.2rem] text-gray-500">Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.</p> </section> <section class="max-w-2xl 2xl:max-w-3xl"><div class="relative z-0 grid grid-cols-1 gap-4 md:min-h-[250px] md:grid-cols-11"><div class="space-y-2 rounded-xl bg-gradient-to-br from-blue-100 to-transparent p-4 dark:from-gray-800 md:col-span-4"><div class="text-xl font-semibold text-blue-500">Inputs</div> <div class="pt-1.5"><img alt="" class="overflow-hidden rounded-lg object-cover" src="https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/video-text-to-text/video-text-to-text-input.gif"></div><div><h6 class="font-semibold">Text Prompt</h6> <p class="text-gray-500">What is happening in this video?</p></div></div> <div class="relative flex min-h-[150px] items-center justify-center space-y-2 rounded-xl bg-gradient-to-br p-8 dark:from-gray-800 md:col-span-3 from-blue-100"><svg class="z-10 absolute top-0 -left-14 hidden md:block text-blue-300" width="87" height="30" viewBox="0 0 87 30" fill="none" xmlns="http://www.w3.org/2000/svg" style=""><defs><linearGradient id="linear-gradient-e0a085jn5"><stop offset="0%" stop-color="#dbeafe"></stop><stop offset="100%" stop-color="currentColor"></stop></linearGradient></defs><path d="M0.976675 22.6978C0.451106 23.3381 0.544167 24.2833 1.18453 24.8089C1.8249 25.3345 2.77008 25.2414 3.29565 24.601L0.976675 22.6978ZM85.8237 27.948L81.0878 11.2875L69.0273 23.7192L85.8237 27.948ZM3.29565 24.601C8.9058 17.7655 18.9303 9.8542 31.4914 6.99576C43.9669 4.1568 59.1046 6.26142 75.1673 19.694L77.0918 17.3927C60.3459 3.38874 44.2588 1.01368 30.8258 4.07055C17.4783 7.10792 6.91044 15.4679 0.976675 22.6978L3.29565 24.601Z" fill="url(#linear-gradient-e0a085jn5)"></path></svg> <svg class="z-10 absolute bottom-0 -right-14 hidden md:block text-blue-100" width="87" height="30" viewBox="0 0 87 30" fill="none" xmlns="http://www.w3.org/2000/svg" style="transform: scaleX(-1) rotate(180deg);"><defs><linearGradient id="linear-gradient-o3j6m5h7v"><stop offset="0%" stop-color="currentColor"></stop><stop offset="100%" stop-color="#86efac"></stop></linearGradient></defs><path d="M0.976675 22.6978C0.451106 23.3381 0.544167 24.2833 1.18453 24.8089C1.8249 25.3345 2.77008 25.2414 3.29565 24.601L0.976675 22.6978ZM85.8237 27.948L81.0878 11.2875L69.0273 23.7192L85.8237 27.948ZM3.29565 24.601C8.9058 17.7655 18.9303 9.8542 31.4914 6.99576C43.9669 4.1568 59.1046 6.26142 75.1673 19.694L77.0918 17.3927C60.3459 3.38874 44.2588 1.01368 30.8258 4.07055C17.4783 7.10792 6.91044 15.4679 0.976675 22.6978L3.29565 24.601Z" fill="url(#linear-gradient-o3j6m5h7v)"></path></svg> <svg class="z-10 absolute -top-9 md:hidden text-blue-300" width="19" height="47" viewBox="0 0 19 47" fill="none" xmlns="http://www.w3.org/2000/svg"><defs><linearGradient id="linear-gradient-uxc07f6e5" x1="0%" y1="0%" x2="0%" y2="100%"><stop offset="0%" stop-color="#dbeafe"></stop><stop offset="100%" stop-color="currentColor"></stop></linearGradient></defs><path fill-rule="evenodd" clip-rule="evenodd" d="M7.95615 0.0566406H10.9562V31.0695H18.1164L9.45615 46.0695L0.795898 31.0695H7.95615V0.0566406Z" fill="url(#linear-gradient-uxc07f6e5)"></path></svg> <svg class="z-10 absolute -bottom-9 md:hidden text-blue-100" width="19" height="47" viewBox="0 0 19 47" fill="none" xmlns="http://www.w3.org/2000/svg"><defs><linearGradient id="linear-gradient-tnwepg43l" x1="0%" y1="0%" x2="0%" y2="100%"><stop offset="0%" stop-color="currentColor"></stop><stop offset="100%" stop-color="#86efac"></stop></linearGradient></defs><path fill-rule="evenodd" clip-rule="evenodd" d="M7.95615 0.0566406H10.9562V31.0695H18.1164L9.45615 46.0695L0.795898 31.0695H7.95615V0.0566406Z" fill="url(#linear-gradient-tnwepg43l)"></path></svg> <div class="absolute inset-0 flex items-center justify-center"><svg class="text-9xl -mt-2 leading-tight opacity-50 dark:opacity-100 dark:text-gray-700 text-blue-400" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg></div> <div class="z-10 text-center text-xl font-bold leading-7">Video-Text-to-Text Model </div></div> <div class="relative space-y-2 rounded-xl bg-gradient-to-br from-green-100 to-transparent p-4 pb-10 dark:from-gray-800 md:col-span-4"><div class="text-xl font-semibold text-green-500">Output</div> <div><h6 class="font-semibold">Answer</h6> <p class="text-gray-500">The video shows a series of images showing a fountain with water jets and a variety of colorful flowers and butterflies in the background.</p></div></div></div></section> <section class="copiable-code-container max-w-2xl"><div class="SVELTE_HYDRATER contents" data-target="RepoCodeCopy" data-props="{}"><div></div></div> <h2 class="mb-8 text-3xl font-semibold">About Video-Text-to-Text</h2> <div class="prose-doc prose"><!-- HTML_TAG_START --><p>Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input images and input videos within the text prompt.</p> <h2 class="relative group flex items-center"> <a id="different-types-of-video-language-models" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#different-types-of-video-language-models" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Different Types of Video Language Models </span> </h2> <p>Video language models come in three types:</p> <ul> <li><strong>Base:</strong> Pre-trained models that can be fine-tuned.</li> <li><strong>Instruction:</strong> Base models fine-tuned on video-instruction pairs and answers.</li> <li><strong>Chatty/Conversational:</strong> Base models fine-tuned on video conversation datasets.</li> </ul> <h2 class="relative group flex items-center"> <a id="use-cases" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#use-cases" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Use Cases </span> </h2> <h3 class="relative group flex items-center"> <a id="video-question-answering" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#video-question-answering" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Video Question Answering </span> </h3> <p>Video language models trained on video-question-answer pairs can be used for video question answering and generating captions for videos.</p> <h3 class="relative group flex items-center"> <a id="video-chat" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#video-chat" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Video Chat </span> </h3> <p>Video language models can be used to have a dialogue about a video.</p> <h3 class="relative group flex items-center"> <a id="video-recognition-with-instructions" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#video-recognition-with-instructions" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Video Recognition with Instructions </span> </h3> <p>Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, they can classify the entities in a video.</p> <h2 class="relative group flex items-center"> <a id="inference" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#inference" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Inference </span> </h2> <p>You can use the Transformers library to interact with video-language models. Below we load <a href="https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf">a video language model</a>, write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run the snippet below, please install <a href="https://pypi.org/project/opencv-python/">OpenCV</a> by running <code>pip install opencv-python</code>.</p> <pre><code class="language-python"><span class="hljs-keyword">import</span> uuid <span class="hljs-keyword">import</span> requests <span class="hljs-keyword">import</span> cv2 <span class="hljs-keyword">import</span> torch <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration device = <span class="hljs-string">"cuda"</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">"cpu"</span> model_id = <span class="hljs-string">"llava-hf/LLaVA-NeXT-Video-7B-hf"</span> model = LlavaNextVideoForConditionalGeneration.from_pretrained( model_id, torch_dtype=torch.float16, low_cpu_mem_usage=<span class="hljs-literal">True</span>, ).to(device) processor = LlavaNextVideoProcessor.from_pretrained(model_id) <span class="hljs-keyword">def</span> <span class="hljs-title function_">sample_frames</span>(<span class="hljs-params">url, num_frames</span>): response = requests.get(url) path_id = <span class="hljs-built_in">str</span>(uuid.uuid4()) path = <span class="hljs-string">f"./<span class="hljs-subst">{path_id}</span>.mp4"</span> <span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(path, <span class="hljs-string">"wb"</span>) <span class="hljs-keyword">as</span> f: f.write(response.content) video = cv2.VideoCapture(path) total_frames = <span class="hljs-built_in">int</span>(video.get(cv2.CAP_PROP_FRAME_COUNT)) interval = total_frames // num_frames frames = [] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(total_frames): ret, frame = video.read() <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> ret: <span class="hljs-keyword">continue</span> <span class="hljs-keyword">if</span> i % interval == <span class="hljs-number">0</span>: pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) frames.append(pil_img) video.release() <span class="hljs-keyword">return</span> frames conversation = [ { <span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: [ {<span class="hljs-string">"type"</span>: <span class="hljs-string">"text"</span>, <span class="hljs-string">"text"</span>: <span class="hljs-string">"Why is this video funny?"</span>}, {<span class="hljs-string">"type"</span>: <span class="hljs-string">"video"</span>}, ], }, ] prompt = processor.apply_chat_template(conversation, add_generation_prompt=<span class="hljs-literal">True</span>) video_url = <span class="hljs-string">"https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"</span> video = sample_frames(video, <span class="hljs-number">8</span>) inputs = processor(text=prompt, videos=video, padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span>).to(model.device) output = model.generate(**inputs, max_new_tokens=<span class="hljs-number">100</span>, do_sample=<span class="hljs-literal">False</span>) <span class="hljs-built_in">print</span>(processor.decode(output[<span class="hljs-number">0</span>][<span class="hljs-number">2</span>:], skip_special_tokens=<span class="hljs-literal">True</span>)) <span class="hljs-comment"># Why is this video funny? ASSISTANT: The humor in this video comes from the cat's facial expression and body language. The cat appears to be making a funny face, with its eyes squinted and mouth open, which can be interpreted as a playful or mischievous expression. Cats often make such faces when they are in a good mood or are playful, and this can be amusing to people who are familiar with their behavior. The combination of the cat's expression and the close-</span> </code></pre> <h2 class="relative group flex items-center"> <a id="useful-resources" class="block pr-1.5 text-lg md:absolute md:p-1.5 md:opacity-0 md:group-hover:opacity-100 md:right-full" href="#useful-resources" > <span class="header-link"><svg class="text-gray-500 hover:text-black dark:hover:text-gray-200 w-4" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span> </a> <span> Useful Resources </span> </h2> <ul> <li><a href="https://huggingface.co/docs/transformers/tasks/video_text_to_text">Transformers task guide on video-text-to-text</a></li> </ul> <!-- HTML_TAG_END --></div></section></div> <div class="flex-none space-y-8 pt-12 lg:col-span-5"><div class="flex flex-col gap-2 sm:flex-row"> </div> <div><h2 class="mb-2.5 font-semibold">Compatible libraries</h2> <div class="flex flex-wrap items-center overflow-hidden"><a class="mb-1 mr-1 md:mb-1.5 md:mr-1.5 rounded-lg" href="/models?library=transformers&pipeline_tag=video-text-to-text"><div class="tag tag-white "><svg class="text-black inline-block text-sm" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" preserveAspectRatio="xMidYMid meet" width="1em" height="1em" viewBox="0 0 95 88"><path fill="#fff" d="M94.25 70.08a8.28 8.28 0 0 1-.43 6.46 10.57 10.57 0 0 1-3 3.6 25.18 25.18 0 0 1-5.7 3.2 65.74 65.74 0 0 1-7.56 2.65 46.67 46.67 0 0 1-11.42 1.68c-5.42.05-10.09-1.23-13.4-4.5a40.4 40.4 0 0 1-10.14.03c-3.34 3.25-7.99 4.52-13.39 4.47a46.82 46.82 0 0 1-11.43-1.68 66.37 66.37 0 0 1-7.55-2.65c-2.28-.98-4.17-2-5.68-3.2a10.5 10.5 0 0 1-3.02-3.6c-.99-2-1.18-4.3-.42-6.46a8.54 8.54 0 0 1-.33-5.63c.25-.95.66-1.83 1.18-2.61a8.67 8.67 0 0 1 2.1-8.47 8.23 8.23 0 0 1 2.82-2.07 41.75 41.75 0 1 1 81.3-.12 8.27 8.27 0 0 1 3.11 2.19 8.7 8.7 0 0 1 2.1 8.47c.52.78.93 1.66 1.18 2.61a8.61 8.61 0 0 1-.32 5.63Z"></path><path fill="#FFD21E" d="M47.21 76.5a34.75 34.75 0 1 0 0-69.5 34.75 34.75 0 0 0 0 69.5Z"></path><path fill="#FF9D0B" d="M81.96 41.75a34.75 34.75 0 1 0-69.5 0 34.75 34.75 0 0 0 69.5 0Zm-73.5 0a38.75 38.75 0 1 1 77.5 0 38.75 38.75 0 0 1-77.5 0Z"></path><path fill="#3A3B45" d="M58.5 32.3c1.28.44 1.78 3.06 3.07 2.38a5 5 0 1 0-6.76-2.07c.61 1.15 2.55-.72 3.7-.32ZM34.95 32.3c-1.28.44-1.79 3.06-3.07 2.38a5 5 0 1 1 6.76-2.07c-.61 1.15-2.56-.72-3.7-.32Z"></path><path fill="#FF323D" d="M46.96 56.29c9.83 0 13-8.76 13-13.26 0-2.34-1.57-1.6-4.09-.36-2.33 1.15-5.46 2.74-8.9 2.74-7.19 0-13-6.88-13-2.38s3.16 13.26 13 13.26Z"></path><path fill="#3A3B45" fill-rule="evenodd" d="M39.43 54a8.7 8.7 0 0 1 5.3-4.49c.4-.12.81.57 1.24 1.28.4.68.82 1.37 1.24 1.37.45 0 .9-.68 1.33-1.35.45-.7.89-1.38 1.32-1.25a8.61 8.61 0 0 1 5 4.17c3.73-2.94 5.1-7.74 5.1-10.7 0-2.34-1.57-1.6-4.09-.36l-.14.07c-2.31 1.15-5.39 2.67-8.77 2.67s-6.45-1.52-8.77-2.67c-2.6-1.29-4.23-2.1-4.23.29 0 3.05 1.46 8.06 5.47 10.97Z" clip-rule="evenodd"></path><path fill="#FF9D0B" d="M70.71 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM24.21 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM17.52 48c-1.62 0-3.06.66-4.07 1.87a5.97 5.97 0 0 0-1.33 3.76 7.1 7.1 0 0 0-1.94-.3c-1.55 0-2.95.59-3.94 1.66a5.8 5.8 0 0 0-.8 7 5.3 5.3 0 0 0-1.79 2.82c-.24.9-.48 2.8.8 4.74a5.22 5.22 0 0 0-.37 5.02c1.02 2.32 3.57 4.14 8.52 6.1 3.07 1.22 5.89 2 5.91 2.01a44.33 44.33 0 0 0 10.93 1.6c5.86 0 10.05-1.8 12.46-5.34 3.88-5.69 3.33-10.9-1.7-15.92-2.77-2.78-4.62-6.87-5-7.77-.78-2.66-2.84-5.62-6.25-5.62a5.7 5.7 0 0 0-4.6 2.46c-1-1.26-1.98-2.25-2.86-2.82A7.4 7.4 0 0 0 17.52 48Zm0 4c.51 0 1.14.22 1.82.65 2.14 1.36 6.25 8.43 7.76 11.18.5.92 1.37 1.31 2.14 1.31 1.55 0 2.75-1.53.15-3.48-3.92-2.93-2.55-7.72-.68-8.01.08-.02.17-.02.24-.02 1.7 0 2.45 2.93 2.45 2.93s2.2 5.52 5.98 9.3c3.77 3.77 3.97 6.8 1.22 10.83-1.88 2.75-5.47 3.58-9.16 3.58-3.81 0-7.73-.9-9.92-1.46-.11-.03-13.45-3.8-11.76-7 .28-.54.75-.76 1.34-.76 2.38 0 6.7 3.54 8.57 3.54.41 0 .7-.17.83-.6.79-2.85-12.06-4.05-10.98-8.17.2-.73.71-1.02 1.44-1.02 3.14 0 10.2 5.53 11.68 5.53.11 0 .2-.03.24-.1.74-1.2.33-2.04-4.9-5.2-5.21-3.16-8.88-5.06-6.8-7.33.24-.26.58-.38 1-.38 3.17 0 10.66 6.82 10.66 6.82s2.02 2.1 3.25 2.1c.28 0 .52-.1.68-.38.86-1.46-8.06-8.22-8.56-11.01-.34-1.9.24-2.85 1.31-2.85Z"></path><path fill="#FFD21E" d="M38.6 76.69c2.75-4.04 2.55-7.07-1.22-10.84-3.78-3.77-5.98-9.3-5.98-9.3s-.82-3.2-2.69-2.9c-1.87.3-3.24 5.08.68 8.01 3.91 2.93-.78 4.92-2.29 2.17-1.5-2.75-5.62-9.82-7.76-11.18-2.13-1.35-3.63-.6-3.13 2.2.5 2.79 9.43 9.55 8.56 11-.87 1.47-3.93-1.71-3.93-1.71s-9.57-8.71-11.66-6.44c-2.08 2.27 1.59 4.17 6.8 7.33 5.23 3.16 5.64 4 4.9 5.2-.75 1.2-12.28-8.53-13.36-4.4-1.08 4.11 11.77 5.3 10.98 8.15-.8 2.85-9.06-5.38-10.74-2.18-1.7 3.21 11.65 6.98 11.76 7.01 4.3 1.12 15.25 3.49 19.08-2.12Z"></path><path fill="#FF9D0B" d="M77.4 48c1.62 0 3.07.66 4.07 1.87a5.97 5.97 0 0 1 1.33 3.76 7.1 7.1 0 0 1 1.95-.3c1.55 0 2.95.59 3.94 1.66a5.8 5.8 0 0 1 .8 7 5.3 5.3 0 0 1 1.78 2.82c.24.9.48 2.8-.8 4.74a5.22 5.22 0 0 1 .37 5.02c-1.02 2.32-3.57 4.14-8.51 6.1-3.08 1.22-5.9 2-5.92 2.01a44.33 44.33 0 0 1-10.93 1.6c-5.86 0-10.05-1.8-12.46-5.34-3.88-5.69-3.33-10.9 1.7-15.92 2.78-2.78 4.63-6.87 5.01-7.77.78-2.66 2.83-5.62 6.24-5.62a5.7 5.7 0 0 1 4.6 2.46c1-1.26 1.98-2.25 2.87-2.82A7.4 7.4 0 0 1 77.4 48Zm0 4c-.51 0-1.13.22-1.82.65-2.13 1.36-6.25 8.43-7.76 11.18a2.43 2.43 0 0 1-2.14 1.31c-1.54 0-2.75-1.53-.14-3.48 3.91-2.93 2.54-7.72.67-8.01a1.54 1.54 0 0 0-.24-.02c-1.7 0-2.45 2.93-2.45 2.93s-2.2 5.52-5.97 9.3c-3.78 3.77-3.98 6.8-1.22 10.83 1.87 2.75 5.47 3.58 9.15 3.58 3.82 0 7.73-.9 9.93-1.46.1-.03 13.45-3.8 11.76-7-.29-.54-.75-.76-1.34-.76-2.38 0-6.71 3.54-8.57 3.54-.42 0-.71-.17-.83-.6-.8-2.85 12.05-4.05 10.97-8.17-.19-.73-.7-1.02-1.44-1.02-3.14 0-10.2 5.53-11.68 5.53-.1 0-.19-.03-.23-.1-.74-1.2-.34-2.04 4.88-5.2 5.23-3.16 8.9-5.06 6.8-7.33-.23-.26-.57-.38-.98-.38-3.18 0-10.67 6.82-10.67 6.82s-2.02 2.1-3.24 2.1a.74.74 0 0 1-.68-.38c-.87-1.46 8.05-8.22 8.55-11.01.34-1.9-.24-2.85-1.31-2.85Z"></path><path fill="#FFD21E" d="M56.33 76.69c-2.75-4.04-2.56-7.07 1.22-10.84 3.77-3.77 5.97-9.3 5.97-9.3s.82-3.2 2.7-2.9c1.86.3 3.23 5.08-.68 8.01-3.92 2.93.78 4.92 2.28 2.17 1.51-2.75 5.63-9.82 7.76-11.18 2.13-1.35 3.64-.6 3.13 2.2-.5 2.79-9.42 9.55-8.55 11 .86 1.47 3.92-1.71 3.92-1.71s9.58-8.71 11.66-6.44c2.08 2.27-1.58 4.17-6.8 7.33-5.23 3.16-5.63 4-4.9 5.2.75 1.2 12.28-8.53 13.36-4.4 1.08 4.11-11.76 5.3-10.97 8.15.8 2.85 9.05-5.38 10.74-2.18 1.69 3.21-11.65 6.98-11.76 7.01-4.31 1.12-15.26 3.49-19.08-2.12Z"></path></svg> <span>Transformers</span> </div></a></div></div> <section class="shadow-alternate-sm rounded-xl border border-gray-100 px-4 pb-4 pt-3 "><div class="mb-1.5 flex items-center whitespace-nowrap font-semibold text-gray-800 lg:text-lg"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" class="mr-1.5 flex-none text-yellow-400" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path d="M11 15H6l7-14v8h5l-7 14v-8z" fill="currentColor"></path></svg> Video-Text-to-Text demo</div> <p class="mb-2 text-gray-500">No example widget is defined for this task.</p> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> Contribute by proposing a widget for this task !</p> </section> <section class="shadow-alternate-sm rounded-xl border border-gray-100 px-4 pb-4 pt-3 "><div class="mb-1.5 flex items-center whitespace-nowrap font-semibold text-gray-800 lg:text-lg"><svg class="mr-1.5 flex-none text-indigo-400" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg> <div class="truncate">Models for Video-Text-to-Text</div> <a class="btn ml-4 text-sm font-normal" href="/models?pipeline_tag=video-text-to-text">Browse Models (44)</a> </div> <div class="mt-4 space-y-4"><div><div class="mb-2 overflow-hidden"><article class="overview-card-wrapper group/repo "><a class="flex items-center justify-between gap-4 p-2" href="/llava-hf/llava-onevision-qwen2-72b-ov-hf"><div class="w-full truncate"><header class="flex items-center mb-0.5" title="llava-hf/llava-onevision-qwen2-72b-ov-hf"> <img alt="" class="w-3.5 h-3.5 rounded mr-1.5 flex-none flex-none" src="https://cdn-avatars.huggingface.co/v1/production/uploads/5f1158120c833276f61f1a84/HYIF0By10WazlTdVv3xp0.jpeg" crossorigin="anonymous"> <h4 class="text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-indigo-600 text-smd">llava-hf/llava-onevision-qwen2-72b-ov-hf</h4> </header> <div class="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400"> <svg class="mr-1.5 text-[.8rem]" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M29.707 19.293l-3-3a1 1 0 0 0-1.414 0L16 25.586V30h4.414l9.293-9.293a1 1 0 0 0 0-1.414zM19.586 28H18v-1.586l5-5L24.586 23zM26 21.586L24.414 20L26 18.414L27.586 20z" fill="currentColor"></path><path d="M20 13v-2h-2.142a3.94 3.94 0 0 0-.425-1.019l1.517-1.517l-1.414-1.414l-1.517 1.517A3.944 3.944 0 0 0 15 8.142V6h-2v2.142a3.944 3.944 0 0 0-1.019.425L10.464 7.05L9.05 8.464l1.517 1.517A3.94 3.94 0 0 0 10.142 11H8v2h2.142a3.94 3.94 0 0 0 .425 1.019L9.05 15.536l1.414 1.414l1.517-1.517a3.944 3.944 0 0 0 1.019.425V18h2v-2.142a3.944 3.944 0 0 0 1.019-.425l1.517 1.517l1.414-1.414l-1.517-1.517A3.94 3.94 0 0 0 17.858 13zm-6 1a2 2 0 1 1 2-2a2.002 2.002 0 0 1-2 2z" fill="currentColor"></path><path d="M12 30H6a2.002 2.002 0 0 1-2-2V4a2.002 2.002 0 0 1 2-2h16a2.002 2.002 0 0 1 2 2v10h-2V4H6v24h6z" fill="currentColor"></path></svg> Image-Text-to-Text <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <span class="truncate">Updated <time datetime="2024-12-06T11:09:58" title="Fri, 06 Dec 2024 11:09:58 GMT">5 days ago</time></span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg> 1.54k <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg> 6 </div></div> </a></article></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> A robust video-text-to-text model that can take in image and video inputs. </p> </div><div><div class="mb-2 overflow-hidden"><article class="overview-card-wrapper group/repo "><a class="flex items-center justify-between gap-4 p-2" href="/llava-hf/LLaVA-NeXT-Video-34B-hf"><div class="w-full truncate"><header class="flex items-center mb-0.5" title="llava-hf/LLaVA-NeXT-Video-34B-hf"> <img alt="" class="w-3.5 h-3.5 rounded mr-1.5 flex-none flex-none" src="https://cdn-avatars.huggingface.co/v1/production/uploads/5f1158120c833276f61f1a84/HYIF0By10WazlTdVv3xp0.jpeg" crossorigin="anonymous"> <h4 class="text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-indigo-600 text-smd">llava-hf/LLaVA-NeXT-Video-34B-hf</h4> </header> <div class="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400"> <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 32 32" aria-hidden="true" class="mr-1.5 text-[.8rem]"><path fill="currentColor" d="m30 19l-4 3.2V20a2 2 0 0 0-2-2h-8a2 2 0 0 0-2 2v6a2 2 0 0 0 2 2h8a2 2 0 0 0 2-2v-2.2l4 3.2Zm-14 7v-6h8l.002 6Z"></path><path fill="currentColor" d="M12 28H8V4h8v6a2.006 2.006 0 0 0 2 2h6v3h2v-5a.91.91 0 0 0-.3-.7l-7-7A.9.9 0 0 0 18 2H8a2.006 2.006 0 0 0-2 2v24a2.006 2.006 0 0 0 2 2h4Zm6-23.6l5.6 5.6H18Z"></path></svg> Video-Text-to-Text <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <span class="truncate">Updated <time datetime="2024-11-22T11:39:33" title="Fri, 22 Nov 2024 11:39:33 GMT">19 days ago</time></span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg> 1.73k <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg> 6 </div></div> </a></article></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> Large and powerful video-text-to-text model that can take in image and video inputs. </p> </div></div> </section> <section class="shadow-alternate-sm rounded-xl border border-gray-100 px-4 pb-4 pt-3 "><div class="mb-1.5 flex items-center whitespace-nowrap font-semibold text-gray-800 lg:text-lg"><svg class="mr-1.5 flex-none text-red-400" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 25 25"><ellipse cx="12.5" cy="5" fill="currentColor" fill-opacity="0.25" rx="7.5" ry="2"></ellipse><path d="M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z" fill="currentColor" opacity="0.5"></path><path d="M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z" fill="currentColor" opacity="0.5"></path><path d="M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z" fill="currentColor"></path></svg> <div class="truncate">Datasets for Video-Text-to-Text</div> <a class="btn ml-4 text-sm font-normal" href="/datasets?task_categories=task_categories:video-text-to-text">Browse Datasets (20)</a> </div> <div class="mt-4 space-y-4"><div><div class="mb-2 overflow-hidden"><article class="overview-card-wrapper group/repo "><a class="flex items-center justify-between gap-4 p-2" href="/datasets/lmms-lab/Video-MME"><div class="w-full truncate"><header class="flex items-center mb-0.5" title="lmms-lab/Video-MME"><svg class="mr-1 text-gray-400 group/repo-hover:text-red-500 flex-none -ml-0.5" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 25 25"><ellipse cx="12.5" cy="5" fill="currentColor" fill-opacity="0.25" rx="7.5" ry="2"></ellipse><path d="M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z" fill="currentColor" opacity="0.5"></path><path d="M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z" fill="currentColor" opacity="0.5"></path><path d="M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z" fill="currentColor"></path></svg> <h4 class="text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-red-600 text-smd">lmms-lab/Video-MME</h4> </header> <div class="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400"><svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill="currentColor" d="M2.5 2h7a1 1 0 0 1 1 1v6a1 1 0 0 1-1 1h-7a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Zm0 2v2h3V4h-3Zm4 0v2h3V4h-3Zm-4 3v2h3V7h-3Zm4 0v2h3V7h-3Z"></path></svg> Viewer <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <span class="truncate">Updated <time datetime="2024-07-04T08:14:20" title="Thu, 04 Jul 2024 08:14:20 GMT">Jul 4</time></span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none text-gray-400 mr-1 w-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill-rule="evenodd" clip-rule="evenodd" d="M9.84 2.7H2.16a.3.3 0 0 0-.3.3v1.421h8.28V3a.3.3 0 0 0-.3-.3ZM1.86 6.879V5.12h8.28V6.88H1.86Zm0 .7V9a.3.3 0 0 0 .3.3h7.68a.3.3 0 0 0 .3-.3V7.579H1.86ZM2.16 2a1 1 0 0 0-1 1v6a1 1 0 0 0 1 1h7.68a1 1 0 0 0 1-1V3a1 1 0 0 0-1-1H2.16Z"></path></svg> <span>2.7k</span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg> 12.7k <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg> 30 </div></div> </a></article></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> Multiple-choice questions and answers about videos. </p> </div><div><div class="mb-2 overflow-hidden"><article class="overview-card-wrapper group/repo "><a class="flex items-center justify-between gap-4 p-2" href="/datasets/lmms-lab/VideoChatGPT"><div class="w-full truncate"><header class="flex items-center mb-0.5" title="lmms-lab/VideoChatGPT"><svg class="mr-1 text-gray-400 group/repo-hover:text-red-500 flex-none -ml-0.5" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 25 25"><ellipse cx="12.5" cy="5" fill="currentColor" fill-opacity="0.25" rx="7.5" ry="2"></ellipse><path d="M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z" fill="currentColor" opacity="0.5"></path><path d="M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z" fill="currentColor" opacity="0.5"></path><path d="M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z" fill="currentColor"></path></svg> <h4 class="text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-red-600 text-smd">lmms-lab/VideoChatGPT</h4> </header> <div class="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400"><svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill="currentColor" d="M2.5 2h7a1 1 0 0 1 1 1v6a1 1 0 0 1-1 1h-7a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Zm0 2v2h3V4h-3Zm4 0v2h3V4h-3Zm-4 3v2h3V7h-3Zm4 0v2h3V7h-3Z"></path></svg> Viewer <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <span class="truncate">Updated <time datetime="2024-04-10T03:18:49" title="Wed, 10 Apr 2024 03:18:49 GMT">Apr 10</time></span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none text-gray-400 mr-1 w-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill-rule="evenodd" clip-rule="evenodd" d="M9.84 2.7H2.16a.3.3 0 0 0-.3.3v1.421h8.28V3a.3.3 0 0 0-.3-.3ZM1.86 6.879V5.12h8.28V6.88H1.86Zm0 .7V9a.3.3 0 0 0 .3.3h7.68a.3.3 0 0 0 .3-.3V7.579H1.86ZM2.16 2a1 1 0 0 0-1 1v6a1 1 0 0 0 1 1h7.68a1 1 0 0 0 1-1V3a1 1 0 0 0-1-1H2.16Z"></path></svg> <span>3.49k</span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg> 1.72k <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg> 7 </div></div> </a></article></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> A dataset of instructions and question-answer pairs about videos. </p> </div><div><div class="mb-2 overflow-hidden"><article class="overview-card-wrapper group/repo "><a class="flex items-center justify-between gap-4 p-2" href="/datasets/HuggingFaceFV/finevideo"><div class="w-full truncate"><header class="flex items-center mb-0.5" title="HuggingFaceFV/finevideo"><svg class="mr-1 text-gray-400 group/repo-hover:text-red-500 flex-none -ml-0.5" style="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 25 25"><ellipse cx="12.5" cy="5" fill="currentColor" fill-opacity="0.25" rx="7.5" ry="2"></ellipse><path d="M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z" fill="currentColor" opacity="0.5"></path><path d="M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z" fill="currentColor" opacity="0.5"></path><path d="M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z" fill="currentColor"></path></svg> <h4 class="text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-red-600 text-smd">HuggingFaceFV/finevideo</h4> </header> <div class="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400"><svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill="currentColor" d="M2.5 2h7a1 1 0 0 1 1 1v6a1 1 0 0 1-1 1h-7a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Zm0 2v2h3V4h-3Zm4 0v2h3V4h-3Zm-4 3v2h3V7h-3Zm4 0v2h3V7h-3Z"></path></svg> Viewer <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <span class="truncate">Updated <time datetime="2024-11-05T07:54:39" title="Tue, 05 Nov 2024 07:54:39 GMT">Nov 5</time></span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none text-gray-400 mr-1 w-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill-rule="evenodd" clip-rule="evenodd" d="M9.84 2.7H2.16a.3.3 0 0 0-.3.3v1.421h8.28V3a.3.3 0 0 0-.3-.3ZM1.86 6.879V5.12h8.28V6.88H1.86Zm0 .7V9a.3.3 0 0 0 .3.3h7.68a.3.3 0 0 0 .3-.3V7.579H1.86ZM2.16 2a1 1 0 0 0-1 1v6a1 1 0 0 0 1 1h7.68a1 1 0 0 0 1-1V3a1 1 0 0 0-1-1H2.16Z"></path></svg> <span>39.5k</span> <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg> 4.96k <span class="px-1.5 text-gray-300 dark:text-gray-500">• </span> <svg class="flex-none w-3 text-gray-400 mr-1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg> 279 </div></div> </a></article></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> Large video understanding dataset. </p> </div></div> </section> <section class="shadow-alternate-sm rounded-xl border border-gray-100 px-4 pb-4 pt-3 "><div class="mb-1.5 flex items-center whitespace-nowrap font-semibold text-gray-800 lg:text-lg"><svg class="mr-1.5 flex-none " xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M7.80914 18.7462V24.1907H13.2536V18.7462H7.80914Z" fill="#FF3270"></path><path d="M18.7458 18.7462V24.1907H24.1903V18.7462H18.7458Z" fill="#861FFF"></path><path d="M7.80914 7.80982V13.2543H13.2536V7.80982H7.80914Z" fill="#097EFF"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M4 6.41775C4 5.08246 5.08246 4 6.41775 4H14.6457C15.7626 4 16.7026 4.75724 16.9802 5.78629C18.1505 4.67902 19.7302 4 21.4685 4C25.0758 4 28.0003 6.92436 28.0003 10.5317C28.0003 12.27 27.3212 13.8497 26.2139 15.02C27.243 15.2977 28.0003 16.2376 28.0003 17.3545V25.5824C28.0003 26.9177 26.9177 28.0003 25.5824 28.0003H17.0635H14.9367H6.41775C5.08246 28.0003 4 26.9177 4 25.5824V15.1587V14.9367V6.41775ZM7.80952 7.80952V13.254H13.254V7.80952H7.80952ZM7.80952 24.1907V18.7462H13.254V24.1907H7.80952ZM18.7462 24.1907V18.7462H24.1907V24.1907H18.7462ZM18.7462 10.5317C18.7462 9.0283 19.9651 7.80952 21.4685 7.80952C22.9719 7.80952 24.1907 9.0283 24.1907 10.5317C24.1907 12.0352 22.9719 13.254 21.4685 13.254C19.9651 13.254 18.7462 12.0352 18.7462 10.5317Z" fill="black"></path><path d="M21.4681 7.80982C19.9647 7.80982 18.7458 9.02861 18.7458 10.5321C18.7458 12.0355 19.9647 13.2543 21.4681 13.2543C22.9715 13.2543 24.1903 12.0355 24.1903 10.5321C24.1903 9.02861 22.9715 7.80982 21.4681 7.80982Z" fill="#FFD702"></path></svg> Spaces using Video-Text-to-Text </div> <div class="mt-3 space-y-3"><div><div class="mb-2"><a href="/spaces/llava-hf/video-llava" class="flex min-w-0 items-center whitespace-nowrap rounded-lg border border-gray-100 px-2 text-xs leading-relaxed text-gray-500 shadow-sm hover:text-gray-800 dark:hover:text-gray-200 md:text-sm md:leading-relaxed inline-flex"><div class="mr-2 flex-none text-[0.6rem]">🐨</div> <div class="truncate group-hover:underline">llava-hf/video-llava</div></a></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> An application to chat with a video-text-to-text model. </p> </div><div><div class="mb-2"><a href="/spaces/opencompass/openvlm_video_leaderboard" class="flex min-w-0 items-center whitespace-nowrap rounded-lg border border-gray-100 px-2 text-xs leading-relaxed text-gray-500 shadow-sm hover:text-gray-800 dark:hover:text-gray-200 md:text-sm md:leading-relaxed inline-flex"><div class="mr-2 flex-none text-[0.6rem]">🌎</div> <div class="truncate group-hover:underline">opencompass/openvlm_video_leaderboard</div></a></div> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> A leaderboard for various video-text-to-text models. </p> </div></div> </section> <section class="shadow-alternate-sm rounded-xl border border-gray-100 px-4 pb-4 pt-3 "><div class="mb-1.5 flex items-center whitespace-nowrap font-semibold text-gray-800 lg:text-lg"><svg class="mr-1.5 flex-none text-green-400" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M6 23H2a1 1 0 0 1-1-1v-8a1 1 0 0 1 1-1h4a1 1 0 0 1 1 1v8a1 1 0 0 1-1 1z" opacity=".25" fill="currentColor"></path><path class="uim-primary" d="M14 23h-4a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1h4a1 1 0 0 1 1 1v20a1 1 0 0 1-1 1z" fill="currentColor"></path><path class="uim-tertiary" d="M22 23h-4a1 1 0 0 1-1-1V10a1 1 0 0 1 1-1h4a1 1 0 0 1 1 1v12a1 1 0 0 1-1 1z" opacity=".5" fill="currentColor"></path></svg> Metrics for Video-Text-to-Text </div> <p class="mb-2 text-gray-500">No example metric is defined for this task.</p> <p class="flex-none text-sm italic text-gray-400"><span class="mr-1 rounded bg-gray-100 px-1 font-mono text-xs font-semibold not-italic text-gray-500 dark:bg-gray-800">Note</span> Contribute by proposing a metric for this task !</p> </section></div></div></main> <footer class="b-12 mb-2 flex border-t border-gray-100 md:h-14"><nav class="container flex flex-col justify-between space-y-2 py-6 text-gray-500 md:flex-row md:items-center md:space-y-0 md:py-0 md:text-sm"><div class="font-semibold text-black md:hidden">Company</div> <div class="order-last pt-6 text-gray-400 md:order-none md:pt-0" href="Terms">© Hugging Face</div> <a class="hover:underline" href="/terms-of-service">TOS</a> <a class="hover:underline" href="/privacy">Privacy</a> <a class="hover:underline" href="/huggingface">About</a> <a class="hover:underline" href="https://apply.workable.com/huggingface/">Jobs</a> <a href="/" class="group order-first flex-none pb-6 md:order-none md:pb-0"><svg class="h-7 w-7 transition-transform group-hover:-translate-y-px" viewBox="0 0 95 88" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M47.2119 76.5C66.4037 76.5 81.9619 60.9419 81.9619 41.75C81.9619 22.5581 66.4037 7 47.2119 7C28.02 7 12.4619 22.5581 12.4619 41.75C12.4619 60.9419 28.02 76.5 47.2119 76.5Z" fill="#FFD21E"></path><path d="M81.9619 41.75C81.9619 22.5581 66.4037 7 47.2119 7C28.02 7 12.4619 22.5581 12.4619 41.75C12.4619 60.9419 28.02 76.5 47.2119 76.5C66.4037 76.5 81.9619 60.9419 81.9619 41.75ZM8.46185 41.75C8.46185 20.349 25.8108 3 47.2119 3C68.6129 3 85.9619 20.349 85.9619 41.75C85.9619 63.151 68.6129 80.5 47.2119 80.5C25.8108 80.5 8.46185 63.151 8.46185 41.75Z" fill="#FF9D0B"></path><path d="M58.5024 32.2915C59.7768 32.7415 60.2839 35.3615 61.5713 34.6769C64.0095 33.3805 64.9351 30.353 63.6387 27.9148C62.3423 25.4767 59.3148 24.5511 56.8766 25.8475C54.4384 27.1439 53.5128 30.1714 54.8092 32.6096C55.4211 33.7604 57.3632 31.8892 58.5024 32.2915Z" fill="#3A3B45"></path><path d="M34.9454 32.2915C33.671 32.7415 33.164 35.3615 31.8766 34.6769C29.4384 33.3805 28.5128 30.353 29.8092 27.9148C31.1056 25.4767 34.1331 24.5511 36.5713 25.8475C39.0095 27.1439 39.9351 30.1714 38.6387 32.6096C38.0268 33.7604 36.0846 31.8892 34.9454 32.2915Z" fill="#3A3B45"></path><path d="M46.9619 56.289C56.7903 56.289 59.9619 47.5261 59.9619 43.0262C59.9619 40.6875 58.3898 41.4236 55.8718 42.6702C53.5449 43.8222 50.4102 45.4101 46.9619 45.4101C39.7822 45.4101 33.9619 38.5263 33.9619 43.0262C33.9619 47.5261 37.1334 56.289 46.9619 56.289Z" fill="#3A3B45"></path><mask id="mask0" mask-type="alpha" maskUnits="userSpaceOnUse" x="33" y="41" width="27" height="16"><path d="M46.9619 56.289C56.7903 56.289 59.9619 47.5261 59.9619 43.0262C59.9619 40.6875 58.3898 41.4236 55.8718 42.6702C53.5449 43.8222 50.4102 45.4101 46.9619 45.4101C39.7822 45.4101 33.9619 38.5263 33.9619 43.0262C33.9619 47.5261 37.1334 56.289 46.9619 56.289Z" fill="white"></path></mask><g mask="url(#mask0)"><path d="M47.2119 66.5C52.0018 66.5 55.8848 62.617 55.8848 57.8271C55.8848 54.0962 53.5291 50.9156 50.224 49.6915C50.1023 49.6464 49.9794 49.604 49.8553 49.5643C49.0219 49.2979 48.1337 52.1623 47.2119 52.1623C46.3506 52.1623 45.5186 49.2797 44.7332 49.5135C41.151 50.5799 38.5389 53.8984 38.5389 57.8271C38.5389 62.617 42.4219 66.5 47.2119 66.5Z" fill="#F94040"></path></g><path d="M70.7119 37C72.5068 37 73.9619 35.5449 73.9619 33.75C73.9619 31.9551 72.5068 30.5 70.7119 30.5C68.9169 30.5 67.4619 31.9551 67.4619 33.75C67.4619 35.5449 68.9169 37 70.7119 37Z" fill="#FF9D0B"></path><path d="M24.2119 37C26.0068 37 27.4619 35.5449 27.4619 33.75C27.4619 31.9551 26.0068 30.5 24.2119 30.5C22.4169 30.5 20.9619 31.9551 20.9619 33.75C20.9619 35.5449 22.4169 37 24.2119 37Z" fill="#FF9D0B"></path><path class="origin-bottom-right transition-transform group-hover:-rotate-6" d="M17.5238 48C15.9048 48 14.4578 48.665 13.4488 49.871C12.8248 50.618 12.1728 51.822 12.1198 53.625C11.4408 53.43 10.7878 53.321 10.1778 53.321C8.6278 53.321 7.2278 53.915 6.2378 54.994C4.9658 56.379 4.4008 58.081 4.6468 59.784C4.7638 60.595 5.0348 61.322 5.4398 61.995C4.5858 62.686 3.9568 63.648 3.6528 64.805C3.4148 65.712 3.1708 67.601 4.4448 69.547C4.3638 69.674 4.2878 69.806 4.2168 69.941C3.4508 71.395 3.4018 73.038 4.0778 74.568C5.1028 76.887 7.6498 78.714 12.5958 80.675C15.6728 81.895 18.4878 82.675 18.5128 82.682C22.5808 83.737 26.2598 84.273 29.4448 84.273C35.2988 84.273 39.4898 82.48 41.9018 78.944C45.7838 73.25 45.2288 68.042 40.2058 63.022C37.4258 60.244 35.5778 56.148 35.1928 55.249C34.4168 52.587 32.3648 49.628 28.9538 49.628H28.9528C28.6658 49.628 28.3758 49.651 28.0898 49.696C26.5958 49.931 25.2898 50.791 24.3568 52.085C23.3498 50.833 22.3718 49.837 21.4868 49.275C20.1528 48.429 18.8198 48 17.5238 48ZM17.5238 52C18.0338 52 18.6568 52.217 19.3438 52.653C21.4768 54.006 25.5928 61.081 27.0998 63.833C27.6048 64.755 28.4678 65.145 29.2448 65.145C30.7868 65.145 31.9908 63.612 29.3858 61.664C25.4688 58.733 26.8428 53.942 28.7128 53.647C28.7948 53.634 28.8758 53.628 28.9538 53.628C30.6538 53.628 31.4038 56.558 31.4038 56.558C31.4038 56.558 33.6018 62.078 37.3778 65.851C41.1538 69.625 41.3488 72.654 38.5968 76.69C36.7198 79.442 33.1268 80.273 29.4448 80.273C25.6258 80.273 21.7108 79.379 19.5168 78.81C19.4088 78.782 6.0658 75.013 7.7558 71.805C8.0398 71.266 8.5078 71.05 9.0968 71.05C11.4768 71.05 15.8058 74.592 17.6668 74.592C18.0828 74.592 18.3758 74.415 18.4958 73.983C19.2888 71.138 6.4388 69.942 7.5218 65.821C7.7128 65.092 8.2308 64.796 8.9588 64.797C12.1038 64.797 19.1598 70.328 20.6388 70.328C20.7518 70.328 20.8328 70.295 20.8768 70.225C21.6178 69.029 21.2118 68.194 15.9888 65.033C10.7658 61.871 7.0998 59.969 9.1848 57.699C9.4248 57.437 9.7648 57.321 10.1778 57.321C13.3488 57.322 20.8408 64.14 20.8408 64.14C20.8408 64.14 22.8628 66.243 24.0858 66.243C24.3668 66.243 24.6058 66.132 24.7678 65.858C25.6348 64.396 16.7148 57.636 16.2118 54.847C15.8708 52.957 16.4508 52 17.5238 52Z" fill="#FF9D0B"></path><path class="origin-bottom-right transition-transform group-hover:-rotate-6" d="M38.5967 76.6898C41.3487 72.6538 41.1537 69.6248 37.3777 65.8508C33.6017 62.0778 31.4037 56.5578 31.4037 56.5578C31.4037 56.5578 30.5827 53.3518 28.7127 53.6468C26.8427 53.9418 25.4697 58.7328 29.3867 61.6638C33.3037 64.5938 28.6067 66.5848 27.0997 63.8328C25.5927 61.0808 21.4777 54.0058 19.3437 52.6528C17.2107 51.2998 15.7087 52.0578 16.2117 54.8468C16.7147 57.6358 25.6357 64.3958 24.7677 65.8588C23.8997 67.3208 20.8407 64.1398 20.8407 64.1398C20.8407 64.1398 11.2687 55.4288 9.18465 57.6988C7.10065 59.9688 10.7657 61.8708 15.9887 65.0328C21.2127 68.1938 21.6177 69.0288 20.8767 70.2248C20.1347 71.4208 8.60465 61.6998 7.52165 65.8208C6.43965 69.9418 19.2887 71.1378 18.4957 73.9828C17.7027 76.8288 9.44465 68.5978 7.75565 71.8048C6.06565 75.0128 19.4087 78.7818 19.5167 78.8098C23.8267 79.9278 34.7727 82.2968 38.5967 76.6898Z" fill="#FFD21E"></path><path class="origin-bottom-left transition-transform group-hover:rotate-6" d="M77.3999 48C79.0189 48 80.4659 48.665 81.4749 49.871C82.0989 50.618 82.7509 51.822 82.8039 53.625C83.4829 53.43 84.1359 53.321 84.7459 53.321C86.2959 53.321 87.6959 53.915 88.6859 54.994C89.9579 56.379 90.5229 58.081 90.2769 59.784C90.1599 60.595 89.8889 61.322 89.4839 61.995C90.3379 62.686 90.9669 63.648 91.2709 64.805C91.5089 65.712 91.7529 67.601 90.4789 69.547C90.5599 69.674 90.6359 69.806 90.7069 69.941C91.4729 71.395 91.5219 73.038 90.8459 74.568C89.8209 76.887 87.2739 78.714 82.3279 80.675C79.2509 81.895 76.4359 82.675 76.4109 82.682C72.3429 83.737 68.6639 84.273 65.4789 84.273C59.6249 84.273 55.4339 82.48 53.0219 78.944C49.1399 73.25 49.6949 68.042 54.7179 63.022C57.4979 60.244 59.3459 56.148 59.7309 55.249C60.5069 52.587 62.5589 49.628 65.9699 49.628H65.9709C66.2579 49.628 66.5479 49.651 66.8339 49.696C68.3279 49.931 69.6339 50.791 70.5669 52.085C71.5739 50.833 72.5519 49.837 73.4369 49.275C74.7709 48.429 76.1039 48 77.3999 48ZM77.3999 52C76.8899 52 76.2669 52.217 75.5799 52.653C73.4469 54.006 69.3309 61.081 67.8239 63.833C67.3189 64.755 66.4559 65.145 65.6789 65.145C64.1369 65.145 62.9329 63.612 65.5379 61.664C69.4549 58.733 68.0809 53.942 66.2109 53.647C66.1289 53.634 66.0479 53.628 65.9699 53.628C64.2699 53.628 63.5199 56.558 63.5199 56.558C63.5199 56.558 61.3219 62.078 57.5459 65.851C53.7699 69.625 53.5749 72.654 56.3269 76.69C58.2039 79.442 61.7969 80.273 65.4789 80.273C69.2979 80.273 73.2129 79.379 75.4069 78.81C75.5149 78.782 88.8579 75.013 87.1679 71.805C86.8839 71.266 86.4159 71.05 85.8269 71.05C83.4469 71.05 79.1179 74.592 77.2569 74.592C76.8409 74.592 76.5479 74.415 76.4279 73.983C75.6349 71.138 88.4849 69.942 87.4019 65.821C87.2109 65.092 86.6929 64.796 85.9649 64.797C82.8199 64.797 75.7639 70.328 74.2849 70.328C74.1719 70.328 74.0909 70.295 74.0469 70.225C73.3059 69.029 73.7119 68.194 78.9349 65.033C84.1579 61.871 87.8239 59.969 85.7389 57.699C85.4989 57.437 85.1589 57.321 84.7459 57.321C81.5749 57.322 74.0829 64.14 74.0829 64.14C74.0829 64.14 72.0609 66.243 70.8379 66.243C70.5569 66.243 70.3179 66.132 70.1559 65.858C69.2889 64.396 78.2089 57.636 78.7119 54.847C79.0529 52.957 78.4729 52 77.3999 52Z" fill="#FF9D0B"></path><path class="origin-bottom-left transition-transform group-hover:rotate-6" d="M56.3271 76.6898C53.5751 72.6538 53.7701 69.6248 57.5461 65.8508C61.3221 62.0778 63.5201 56.5578 63.5201 56.5578C63.5201 56.5578 64.3411 53.3518 66.2111 53.6468C68.0811 53.9418 69.4541 58.7328 65.5371 61.6638C61.6201 64.5938 66.3171 66.5848 67.8241 63.8328C69.3311 61.0808 73.4461 54.0058 75.5801 52.6528C77.7131 51.2998 79.2151 52.0578 78.7121 54.8468C78.2091 57.6358 69.2881 64.3958 70.1561 65.8588C71.0241 67.3208 74.0831 64.1398 74.0831 64.1398C74.0831 64.1398 83.6551 55.4288 85.7391 57.6988C87.8231 59.9688 84.1581 61.8708 78.9351 65.0328C73.7111 68.1938 73.3061 69.0288 74.0471 70.2248C74.7891 71.4208 86.3191 61.6998 87.4021 65.8208C88.4841 69.9418 75.6351 71.1378 76.4281 73.9828C77.2211 76.8288 85.4791 68.5978 87.1681 71.8048C88.8581 75.0128 75.5151 78.7818 75.4071 78.8098C71.0971 79.9278 60.1511 82.2968 56.3271 76.6898Z" fill="#FFD21E"></path></svg></a> <div class="pt-6 font-semibold text-black md:hidden md:pt-0">Website</div> <a class="hover:underline" href="/models">Models</a> <a class="hover:underline" href="/datasets">Datasets</a> <a class="hover:underline" href="/spaces">Spaces</a> <a class="hover:underline" href="/pricing">Pricing</a> <a class="hover:underline" href="/docs">Docs</a></nav></footer></div> <script> import("\/front\/build\/kube-6fe56a2\/index.js"); window.moonSha = "kube-6fe56a2\/"; window.__hf_deferred = {}; </script> <!-- Stripe --> <script> if (["hf.co", "huggingface.co"].includes(window.location.hostname)) { const script = document.createElement("script"); script.src = "https://js.stripe.com/v3/"; script.async = true; document.head.appendChild(script); } </script> </body> </html>