Search | arXiv e-print repository

Showing 1&ndash;50 of 182 results for author: Dai, Q

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page (pages 1&ndash;4; page 1 shown below).

1. arXiv:2503.21442 [pdf, other] | cs.GR (Graphics), cs.CV (Computer Vision and Pattern Recognition)
   RainyGS: Efficient Rain Synthesis with Physically-Based Gaussian Splatting
   Authors: Qiyu Dai, Xingyu Ni, Qianfan Shen, Wenzheng Chen, Baoquan Chen, Mengyu Chu
   Abstract: We consider the problem of adding dynamic rain effects to in-the-wild scenes in a physically-correct manner. Recent advances in scene modeling have made significant progress, with NeRF and 3DGS techniques emerging as powerful tools for reconstructing complex scenes. However, while effective for novel view synthesis, these methods typically struggle with challenging scene editing tasks, such as physics-based rain simulation. In contrast, traditional physics-based simulations can generate realistic rain effects, such as raindrops and splashes, but they often rely on skilled artists to carefully set up high-fidelity scenes. This process lacks flexibility and scalability, limiting its applicability to broader, open-world environments. In this work, we introduce RainyGS, a novel approach that leverages the strengths of both physics-based modeling and 3DGS to generate photorealistic, dynamic rain effects in open-world scenes with physical accuracy. At the core of our method is the integration of physically-based raindrop and shallow water simulation techniques within the fast 3DGS rendering framework, enabling realistic and efficient simulations of raindrop behavior, splashes, and reflections. Our method supports synthesizing rain effects at over 30 fps, offering users flexible control over rain intensity, from light drizzles to heavy downpours. We demonstrate that RainyGS performs effectively for both real-world outdoor scenes and large-scale driving scenarios, delivering more photorealistic and physically-accurate rain effects compared to state-of-the-art methods. Project page: https://pku-vcl-geometry.github.io/RainyGS/
   Submitted 27 March, 2025; originally announced March 2025.
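
The shallow-water component mentioned in entry 1 can be pictured with a toy height-field ripple simulation. The sketch below is only an illustration of that idea under simplifying assumptions (a damped wave-equation update on a 2D grid); it is not the authors' simulator, and the function names (`add_raindrop`, `splash_step`) and constants are hypothetical.

```python
import numpy as np

def add_raindrop(h, x, y, amp=0.5):
    """Inject a raindrop impact as a bump in the water height field h."""
    h[y, x] += amp
    return h

def splash_step(h, v, dt=0.02, c=2.0, damping=0.995):
    """One explicit step of a damped height-field wave approximation:
    ripples from impacts propagate outward and slowly decay."""
    lap = (np.roll(h, 1, 0) + np.roll(h, -1, 0) +
           np.roll(h, 1, 1) + np.roll(h, -1, 1) - 4.0 * h)  # discrete Laplacian
    v = damping * (v + dt * c * c * lap)                    # wave acceleration
    return h + dt * v, v

# toy usage:
# h, v = np.zeros((128, 128)), np.zeros((128, 128))
# h = add_raindrop(h, 64, 64)
# h, v = splash_step(h, v)
```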

2. arXiv:2503.16421 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence), cs.LG (Machine Learning), cs.MM (Multimedia)
   MagicMotion: Controllable Video Generation with Dense-to-Sparse Trajectory Guidance
   Authors: Quanhao Li, Zhen Xing, Rui Wang, Hui Zhang, Qi Dai, Zuxuan Wu
   Abstract: Recent advances in video generation have led to remarkable improvements in visual quality and temporal coherence. Building on this, trajectory-controllable video generation has emerged to enable precise object motion control through explicitly defined spatial paths. However, existing methods struggle with complex object movements and multi-object motion control, resulting in imprecise trajectory adherence, poor object consistency, and compromised visual quality. Furthermore, these methods support trajectory control in only a single format, limiting their applicability in diverse scenarios. Additionally, there is no publicly available dataset or benchmark specifically tailored for trajectory-controllable video generation, hindering robust training and systematic evaluation. To address these challenges, we introduce MagicMotion, a novel image-to-video generation framework that enables trajectory control through three levels of conditions from dense to sparse: masks, bounding boxes, and sparse boxes. Given an input image and trajectories, MagicMotion seamlessly animates objects along defined trajectories while maintaining object consistency and visual quality. Furthermore, we present MagicData, a large-scale trajectory-controlled video dataset, along with an automated pipeline for annotation and filtering. We also introduce MagicBench, a comprehensive benchmark that assesses both video quality and trajectory control accuracy across different numbers of objects. Extensive experiments demonstrate that MagicMotion outperforms previous methods across various metrics. Our project page is publicly available at https://quanhaol.github.io/magicmotion-site.
   Submitted 20 March, 2025; originally announced March 2025.
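
MagicMotion's three control levels (entry 2) run from dense masks to bounding boxes to sparse boxes. As a rough illustration of how a dense trajectory can be coarsened, the sketch below derives per-frame boxes from boolean masks and then thins them out; the paper's exact sparsification scheme is not reproduced here, so `keep_every` and both function names are assumptions.

```python
import numpy as np

def masks_to_boxes(masks):
    """Collapse per-frame boolean object masks into tight boxes (x0, y0, x1, y1);
    frames where the object is absent get None."""
    boxes = []
    for m in masks:
        ys, xs = np.nonzero(m)
        boxes.append((int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max()))
                     if xs.size else None)
    return boxes

def boxes_to_sparse(boxes, keep_every=8):
    """Keep a box only on every k-th frame to form the sparsest control level."""
    return [b if i % keep_every == 0 else None for i, b in enumerate(boxes)]
```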

3. arXiv:2503.14229 [pdf, other] | cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition), cs.RO (Robotics)
   HA-VLN: A Benchmark for Human-Aware Navigation in Discrete-Continuous Environments with Dynamic Multi-Human Interactions, Real-World Validation, and an Open Leaderboard
   Authors: Yifei Dong, Fengyi Wu, Qi He, Heng Li, Minghan Li, Zebang Cheng, Yuxuan Zhou, Jingdong Sun, Qi Dai, Zhi-Qi Cheng, Alexander G Hauptmann
   Abstract: Vision-and-Language Navigation (VLN) systems often focus on either discrete (panoramic) or continuous (free-motion) paradigms alone, overlooking the complexities of human-populated, dynamic environments. We introduce a unified Human-Aware VLN (HA-VLN) benchmark that merges these paradigms under explicit social-awareness constraints. Our contributions include: 1. A standardized task definition that balances discrete-continuous navigation with personal-space requirements; 2. An enhanced human motion dataset (HAPS 2.0) and upgraded simulators capturing realistic multi-human interactions, outdoor contexts, and refined motion-language alignment; 3. Extensive benchmarking on 16,844 human-centric instructions, revealing how multi-human dynamics and partial observability pose substantial challenges for leading VLN agents; 4. Real-world robot tests validating sim-to-real transfer in crowded indoor spaces; and 5. A public leaderboard supporting transparent comparisons across discrete and continuous tasks. Empirical results show improved navigation success and fewer collisions when social context is integrated, underscoring the need for human-centric design. By releasing all datasets, simulators, agent code, and evaluation tools, we aim to advance safer, more capable, and socially responsible VLN research.
   Submitted 18 March, 2025; originally announced March 2025.
   Comments: 27 pages; website: https://ha-vln-project.vercel.app/

4. arXiv:2503.11513 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
   HiTVideo: Hierarchical Tokenizers for Enhancing Text-to-Video Generation with Autoregressive Large Language Models
   Authors: Ziqin Zhou, Yifan Yang, Yuqing Yang, Tianyu He, Houwen Peng, Kai Qiu, Qi Dai, Lili Qiu, Chong Luo, Lingqiao Liu
   Abstract: Text-to-video generation poses significant challenges due to the inherent complexity of video data, which spans both temporal and spatial dimensions. It introduces additional redundancy, abrupt variations, and a domain gap between language and vision tokens during generation. Addressing these challenges requires an effective video tokenizer that can efficiently encode video data while preserving essential semantic and spatiotemporal information, serving as a critical bridge between text and vision. Inspired by the observation in VQ-VAE-2 and workflows of traditional animation, we propose HiTVideo for text-to-video generation with hierarchical tokenizers. It utilizes a 3D causal VAE with a multi-layer discrete token framework, encoding video content into hierarchically structured codebooks. Higher layers capture semantic information with higher compression, while lower layers focus on fine-grained spatiotemporal details, striking a balance between compression efficiency and reconstruction quality. Our approach efficiently encodes longer video sequences (e.g., 8 seconds, 64 frames), reducing bits per pixel (bpp) by approximately 70% compared to baseline tokenizers while maintaining competitive reconstruction quality. We explore the trade-offs between compression and reconstruction, emphasizing the advantages of highly compressed semantic tokens in text-to-video tasks. HiTVideo aims to address the limitations of existing video tokenizers in text-to-video generation, striving for higher compression ratios and simpler LLM modeling under language guidance, offering a scalable and promising framework for advancing text-to-video generation. Demo page: https://ziqinzhou66.github.io/project/HiTVideo.
   Submitted 14 March, 2025; originally announced March 2025.
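
The roughly 70% bpp reduction claimed for HiTVideo (entry 4) is easiest to reason about with a back-of-envelope bits-per-pixel formula for a multi-level discrete tokenizer. The helper below is a generic estimate, not the paper's accounting, and the token and codebook numbers in the usage comment are made-up placeholders.

```python
import math

def bits_per_pixel(tokens_per_level, codebook_sizes, frames, height, width):
    """Each level costs (#tokens) * log2(codebook size) bits; spread the total
    over every raw pixel of the clip to get bpp."""
    total_bits = sum(n * math.log2(k) for n, k in zip(tokens_per_level, codebook_sizes))
    return total_bits / (frames * height * width)

# hypothetical example: a 64-frame 256x256 clip with a coarse and a fine token level
# bits_per_pixel([1024, 8192], [8192, 8192], frames=64, height=256, width=256)
```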
href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Chun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yi-ling Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+X">Xiyang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+R">Ruchao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+M">Mei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+M">Min Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+A">Amit Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Goswami%2C+A">Abhishek Goswami</a> , et al. (51 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01743v2-abstract-short" style="display: inline;"> We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. Phi-4-Mini is a 3.8-billion-parameter language model trained on high-quality web and synthetic data, significantly outperforming recent open-source models of similar size and matching the performance of models twice its size on math and coding tasks requiring complex reasoning. This achievement&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01743v2-abstract-full').style.display = 'inline'; document.getElementById('2503.01743v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01743v2-abstract-full" style="display: none;"> We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. Phi-4-Mini is a 3.8-billion-parameter language model trained on high-quality web and synthetic data, significantly outperforming recent open-source models of similar size and matching the performance of models twice its size on math and coding tasks requiring complex reasoning. This achievement is driven by a carefully curated synthetic data recipe emphasizing high-quality math and coding datasets. Compared to its predecessor, Phi-3.5-Mini, Phi-4-Mini features an expanded vocabulary size of 200K tokens to better support multilingual applications, as well as group query attention for more efficient long-sequence generation. Phi-4-Multimodal is a multimodal model that integrates text, vision, and speech/audio input modalities into a single model. Its novel modality extension approach leverages LoRA adapters and modality-specific routers to allow multiple inference modes combining various modalities without interference. For example, it now ranks first in the OpenASR leaderboard to date, although the LoRA component of the speech/audio modality has just 460 million parameters. Phi-4-Multimodal supports scenarios involving (vision + language), (vision + speech), and (speech/audio) inputs, outperforming larger vision-language and speech-language models on a wide range of tasks. Additionally, we experiment to further train Phi-4-Mini to enhance its reasoning capabilities. Despite its compact 3.8-billion-parameter size, this experimental version achieves reasoning performance on par with or surpassing significantly larger models, including DeepSeek-R1-Distill-Qwen-7B and DeepSeek-R1-Distill-Llama-8B. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01743v2-abstract-full').style.display = 'none'; document.getElementById('2503.01743v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">39 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01490">arXiv:2503.01490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.01490">pdf</a>, <a href="https://arxiv.org/format/2503.01490">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving Retrospective Language Agents via Joint Policy Gradient Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+X">Xueyang Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+B">Bo Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jiakai Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhenhua Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01490v1-abstract-short" style="display: inline;"> In recent research advancements within the community, large language models (LLMs) have sparked great interest in creating autonomous agents. However, current prompt-based agents often heavily rely on large-scale LLMs. Meanwhile, although fine-tuning methods significantly enhance the capabilities of smaller LLMs, the fine-tuned agents often lack the potential for self-reflection and self-improveme&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01490v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01490v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01490v1-abstract-full" style="display: none;"> In recent research advancements within the community, large language models (LLMs) have sparked great interest in creating autonomous agents. However, current prompt-based agents often heavily rely on large-scale LLMs. Meanwhile, although fine-tuning methods significantly enhance the capabilities of smaller LLMs, the fine-tuned agents often lack the potential for self-reflection and self-improvement. To address these challenges, we introduce a novel agent framework named RetroAct, which is a framework that jointly optimizes both task-planning and self-reflective evolution capabilities in language agents. 

6. arXiv:2503.01490 [pdf, other] | cs.CL (Computation and Language)
   Improving Retrospective Language Agents via Joint Policy Gradient Optimization
   Authors: Xueyang Feng, Bo Lan, Quanyu Dai, Lei Wang, Jiakai Tang, Xu Chen, Zhenhua Dong, Ji-Rong Wen
   Abstract: In recent research advancements within the community, large language models (LLMs) have sparked great interest in creating autonomous agents. However, current prompt-based agents often rely heavily on large-scale LLMs. Meanwhile, although fine-tuning methods significantly enhance the capabilities of smaller LLMs, the fine-tuned agents often lack the potential for self-reflection and self-improvement. To address these challenges, we introduce RetroAct, a novel agent framework that jointly optimizes both task-planning and self-reflective evolution capabilities in language agents. Specifically, we develop a two-stage joint optimization process that integrates imitation learning and reinforcement learning, and design an off-policy joint policy gradient optimization algorithm with imitation learning regularization to enhance data efficiency and training stability in agent tasks. RetroAct significantly improves the performance of open-source models, reduces dependency on closed-source LLMs, and enables fine-tuned agents to learn and evolve continuously. We conduct extensive experiments across various testing environments, demonstrating that RetroAct yields substantial improvements in task performance and decision-making processes.
   Submitted 3 March, 2025; originally announced March 2025.
   Comments: NAACL2025
   ACM Class: I.2.7
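
RetroAct (entry 6) combines policy-gradient training with imitation-learning regularization. The loss below is a minimal sketch of that combination only: it omits the paper's two-stage schedule and off-policy importance weighting, and the name `joint_loss` and the weight `lam` are assumptions.

```python
import torch

def joint_loss(logp_actions, advantages, logp_expert, lam=0.5):
    """Policy-gradient surrogate on the agent's own trajectories plus an
    imitation (behavior-cloning) regularizer on expert trajectories."""
    pg = -(advantages.detach() * logp_actions).mean()  # REINFORCE-style term
    il = -logp_expert.mean()                           # NLL of expert actions
    return pg + lam * il
```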

7. arXiv:2503.00334 [pdf, other] | cs.LG (Machine Learning), cs.AI (Artificial Intelligence), stat.ML (Machine Learning) | DOI: 10.1145/3696410.3714802
   MCNet: Monotonic Calibration Networks for Expressive Uncertainty Calibration in Online Advertising
   Authors: Quanyu Dai, Jiaren Xiao, Zhaocheng Du, Jieming Zhu, Chengxiao Luo, Xiao-Ming Wu, Zhenhua Dong
   Abstract: In online advertising, uncertainty calibration aims to adjust a ranking model's probability predictions to better approximate the true likelihood of an event, e.g., a click or a conversion. However, existing calibration approaches may lack the ability to effectively model complex nonlinear relations, consider context features, and achieve balanced performance across different data subsets. To tackle these challenges, we introduce a novel model called Monotonic Calibration Networks, featuring three key designs: a monotonic calibration function (MCF), an order-preserving regularizer, and a field-balance regularizer. The nonlinear MCF is capable of naturally modeling and universally approximating the intricate relations between uncalibrated predictions and the posterior probabilities, thus being much more expressive than existing methods. MCF can also integrate context features using a flexible model architecture, thereby achieving context awareness. The order-preserving and field-balance regularizers promote the monotonic relationship between adjacent bins and balanced calibration performance on data subsets, respectively. Experimental results on both public and industrial datasets demonstrate the superior performance of our method in generating well-calibrated probability predictions.
   Submitted 28 February, 2025; originally announced March 2025.
   Comments: Accepted by WWW2025
   ACM Class: H.0
   Journal ref: The ACM Web Conference 2025

8. arXiv:2502.14735 [pdf, other] | cs.IR (Information Retrieval), cs.AI (Artificial Intelligence) | DOI: 10.1145/3696410.3714933
   EAGER-LLM: Enhancing Large Language Models as Recommenders through Exogenous Behavior-Semantic Integration
   Authors: Minjie Hong, Yan Xia, Zehan Wang, Jieming Zhu, Ye Wang, Sihang Cai, Xiaoda Yang, Quanyu Dai, Zhenhua Dong, Zhimeng Zhang, Zhou Zhao
   Abstract: Large language models (LLMs) are increasingly leveraged as foundational backbones in the development of advanced recommender systems, offering enhanced capabilities through their extensive knowledge and reasoning. Existing LLM-based recommender systems (RSs) often face challenges due to the significant differences between the linguistic semantics of pre-trained LLMs and the collaborative semantics essential for RSs. These systems use pre-trained linguistic semantics but learn collaborative semantics from scratch via the LLM backbone. However, LLMs are not designed for recommendation, leading to inefficient collaborative learning, weak result correlations, and poor integration of traditional RS features. To address these challenges, we propose EAGER-LLM, a decoder-only LLM-based generative recommendation framework that integrates endogenous and exogenous behavioral and semantic information in a non-intrusive manner. Specifically, we propose (1) dual-source knowledge-rich item indices that integrate indexing sequences for exogenous signals, enabling efficient link-wide processing; (2) non-invasive multiscale alignment reconstruction tasks that guide the model toward a deeper understanding of both collaborative and semantic signals; and (3) an annealing adapter designed to finely balance the model's recommendation performance with its comprehension capabilities. We demonstrate EAGER-LLM's effectiveness through rigorous testing on three public benchmarks.
   Submitted 20 February, 2025; originally announced February 2025.
   Comments: 9 pages, 6 figures, accepted by WWW 2025

9. arXiv:2502.11528 [pdf, other] | cs.AI (Artificial Intelligence)
   A Survey of Personalized Large Language Models: Progress and Future Directions
   Authors: Jiahong Liu, Zexuan Qiu, Zhongyang Li, Quanyu Dai, Jieming Zhu, Minda Hu, Menglin Yang, Irwin King
   Abstract: Large Language Models (LLMs) excel in handling general knowledge tasks, yet they struggle with user-specific personalization, such as understanding individual emotions, writing styles, and preferences. Personalized Large Language Models (PLLMs) tackle these challenges by leveraging individual user data, such as user profiles, historical dialogues, content, and interactions, to deliver responses that are contextually relevant and tailored to each user's specific needs. This is a highly valuable research topic, as PLLMs can significantly enhance user satisfaction and have broad applications in conversational agents, recommendation systems, emotion recognition, medical assistants, and more. This survey reviews recent advancements in PLLMs from three technical perspectives: prompting for personalized context (input level), finetuning for personalized adapters (model level), and alignment for personalized preferences (objective level). To provide deeper insights, we also discuss current limitations and outline several promising directions for future research. Updated information about this survey can be found at https://github.com/JiahongLiu21/Awesome-Personalized-Large-Language-Models.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: 7 pages, 5 figures, under review

10. arXiv:2502.08244 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
    FloVD: Optical Flow Meets Video Diffusion Model for Enhanced Camera-Controlled Video Synthesis
    Authors: Wonjoon Jin, Qi Dai, Chong Luo, Seung-Hwan Baek, Sunghyun Cho
    Abstract: We present FloVD, a novel video diffusion model for camera-controllable video generation. FloVD leverages optical flow to represent the motions of the camera and moving objects. This approach offers two key benefits. Since optical flow can be directly estimated from videos, our approach allows for the use of arbitrary training videos without ground-truth camera parameters. Moreover, as background optical flow encodes 3D correlation across different viewpoints, our method enables detailed camera control by leveraging the background motion. To synthesize natural object motion while supporting detailed camera control, our framework adopts a two-stage video synthesis pipeline consisting of optical flow generation and flow-conditioned video synthesis. Extensive experiments demonstrate the superiority of our method over previous approaches in terms of accurate camera control and natural object motion synthesis.
    Submitted 24 March, 2025; v1 submitted 12 February, 2025; originally announced February 2025.
    Comments: Accepted to CVPR 2025. Website: https://jinwonjoon.github.io/flovd_site/ Code: https://github.com/JinWonjoon/FloVD
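
The claim in FloVD (entry 10) that background optical flow encodes camera motion follows from standard rigid-scene geometry: given per-pixel depth and a camera motion (R, t), the flow of a static background is a reprojection displacement. The sketch below is that textbook computation under assumed conditions (pinhole model, no lens distortion, metric z-depth), not the paper's code.

```python
import numpy as np

def camera_induced_flow(depth, K, R, t):
    """Flow of a static background under camera motion (R, t): backproject each
    pixel with its depth, transform to the new camera frame, reproject, subtract."""
    H, W = depth.shape
    ys, xs = np.mgrid[0:H, 0:W]
    pix = np.stack([xs, ys, np.ones_like(xs)], 0).reshape(3, -1).astype(np.float64)
    pts = np.linalg.inv(K) @ pix * depth.reshape(1, -1)  # 3D points in frame 1
    proj = K @ (R @ pts + t.reshape(3, 1))               # project into frame 2
    proj = proj[:2] / proj[2:3]
    return (proj - pix[:2]).reshape(2, H, W)             # per-pixel (u, v) displacement
```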
Typically, instructions are paired with multiple responses sampled from other LLMs, which are often out of the distribution of the target model to be fine-tuned. This, at scale, can lead to diminishing returns and even hurt the models&#39; performance and robustness. We propose **GRAPE**, a novel SFT framework that accounts for the unique characteristics of the target model. For each instruction, it gathers responses from various LLMs and selects the one with the highest probability measured by the target model, indicating that it aligns most closely with the target model&#39;s pretrained distribution; it then proceeds with standard SFT training. We first evaluate GRAPE with a controlled experiment, where we sample various solutions for each question in UltraInteract from multiple models and fine-tune commonly used LMs like LLaMA3.1-8B, Mistral-7B, and Qwen2.5-7B on GRAPE-selected data. GRAPE significantly outperforms strong baselines, including distilling from the strongest model with an absolute gain of up to 13.8%, averaged across benchmarks, and training on 3x more data with a maximum performance improvement of 17.3%. GRAPE&#39;s strong performance generalizes to realistic settings. We experiment with the post-training data used for Tulu3 and Olmo-2. GRAPE outperforms strong baselines trained on 4.5 times more data by 6.1% and a state-of-the-art data selection approach by 3% on average performance. Remarkably, using 1/3 of the data and half the number of epochs, GRAPE enables LLaMA3.1-8B to surpass the performance of Tulu3-SFT by 3.5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04194v2-abstract-full').style.display = 'none'; document.getElementById('2502.04194v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12147">arXiv:2501.12147</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12147">pdf</a>, <a href="https://arxiv.org/format/2501.12147">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Influence-based Instruction Tuning Data Selection for Balanced Learning of Diverse Capabilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qirun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dylan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J+W">Jiaqi W. 
Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+H">Hao Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12147v1-abstract-short" style="display: inline;"> Selecting appropriate training data is crucial for effective instruction fine-tuning of large language models (LLMs), which aims to (1) elicit strong capabilities, and (2) achieve balanced performance across a diverse range of tasks. Influence-based methods show promise in achieving (1) by estimating the contribution of each training example to the model&#39;s predictions, but often struggle with (2).&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12147v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12147v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12147v1-abstract-full" style="display: none;"> Selecting appropriate training data is crucial for effective instruction fine-tuning of large language models (LLMs), which aims to (1) elicit strong capabilities, and (2) achieve balanced performance across a diverse range of tasks. Influence-based methods show promise in achieving (1) by estimating the contribution of each training example to the model&#39;s predictions, but often struggle with (2). Our systematic investigation reveals that this underperformance can be attributed to an inherent bias where certain tasks intrinsically have greater influence than others. As a result, data selection is often biased towards these tasks, not only hurting the model&#39;s performance on others but also, counterintuitively, harming performance on these high-influence tasks themselves. As a remedy, we propose BIDS, a Balanced and Influential Data Selection algorithm. BIDS first normalizes influence scores of the training data, and then iteratively balances data selection by choosing the training example with the highest influence on the most underrepresented task. Experiments with both Llama-3 and Mistral-v0.3 on seven benchmarks spanning five diverse capabilities show that BIDS consistently outperforms both state-of-the-art influence-based algorithms and other non-influence-based selection frameworks. Surprisingly, training on a 15% subset selected by BIDS can even outperform full-dataset training with a much more balanced performance. Our analysis further highlights the importance of both instance-level normalization and iterative optimization of selected data for balanced learning of diverse capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12147v1-abstract-full').style.display = 'none'; document.getElementById('2501.12147v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09493">arXiv:2501.09493</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09493">pdf</a>, <a href="https://arxiv.org/format/2501.09493">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models as Evaluators for Conversational Recommender Systems: Benchmarking System Performance from a User-Centric Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+X">Xiaoyu Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiao-Ming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhenhua Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09493v2-abstract-short" style="display: inline;"> Conversational recommender systems (CRS) involve both recommendation and dialogue tasks, which makes their evaluation a unique challenge. Although past research has analyzed various factors that may affect user satisfaction with CRS interactions from the perspective of user studies, few evaluation metrics for CRS have been proposed. Recent studies have shown that LLMs can align with human preferen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09493v2-abstract-full').style.display = 'inline'; document.getElementById('2501.09493v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09493v2-abstract-full" style="display: none;"> Conversational recommender systems (CRS) involve both recommendation and dialogue tasks, which makes their evaluation a unique challenge. Although past research has analyzed various factors that may affect user satisfaction with CRS interactions from the perspective of user studies, few evaluation metrics for CRS have been proposed. Recent studies have shown that LLMs can align with human preferences, and several LLM-based text quality evaluation measures have been introduced. However, the application of LLMs in CRS evaluation remains relatively limited. To address this research gap and advance the development of user-centric conversational recommender systems, this study proposes an automated LLM-based CRS evaluation framework, building upon existing research in human-computer interaction and psychology. The framework evaluates CRS from four dimensions: dialogue behavior, language expression, recommendation items, and response content. We use this framework to evaluate four different conversational recommender systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09493v2-abstract-full').style.display = 'none'; document.getElementById('2501.09493v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07917">arXiv:2501.07917</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07917">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Roadmap on Neuromorphic Photonics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brunner%2C+D">Daniel Brunner</a>, <a href="/search/cs?searchtype=author&amp;query=Shastri%2C+B+J">Bhavin J. Shastri</a>, <a href="/search/cs?searchtype=author&amp;query=Qadasi%2C+M+A+A">Mohammed A. Al Qadasi</a>, <a href="/search/cs?searchtype=author&amp;query=Ballani%2C+H">H. Ballani</a>, <a href="/search/cs?searchtype=author&amp;query=Barbay%2C+S">Sylvain Barbay</a>, <a href="/search/cs?searchtype=author&amp;query=Biasi%2C+S">Stefano Biasi</a>, <a href="/search/cs?searchtype=author&amp;query=Bienstman%2C+P">Peter Bienstman</a>, <a href="/search/cs?searchtype=author&amp;query=Bilodeau%2C+S">Simon Bilodeau</a>, <a href="/search/cs?searchtype=author&amp;query=Bogaerts%2C+W">Wim Bogaerts</a>, <a href="/search/cs?searchtype=author&amp;query=B%C3%B6hm%2C+F">Fabian B枚hm</a>, <a href="/search/cs?searchtype=author&amp;query=Brennan%2C+G">G. Brennan</a>, <a href="/search/cs?searchtype=author&amp;query=Buckley%2C+S">Sonia Buckley</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xinlun Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Strinati%2C+M+C">Marcello Calvanese Strinati</a>, <a href="/search/cs?searchtype=author&amp;query=Canakci%2C+B">B. Canakci</a>, <a href="/search/cs?searchtype=author&amp;query=Charbonnier%2C+B">Benoit Charbonnier</a>, <a href="/search/cs?searchtype=author&amp;query=Chemnitz%2C+M">Mario Chemnitz</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yitong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheung%2C+S">Stanley Cheung</a>, <a href="/search/cs?searchtype=author&amp;query=Chiles%2C+J">Jeff Chiles</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+S">Suyeon Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Christodoulides%2C+D+N">Demetrios N. Christodoulides</a>, <a href="/search/cs?searchtype=author&amp;query=Chrostowski%2C+L">Lukas Chrostowski</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+J">J. Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Clegg%2C+J+H">J. H. Clegg</a> , et al. 
(125 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07917v2-abstract-short" style="display: inline;"> This roadmap consolidates recent advances while exploring emerging applications, reflecting the remarkable diversity of hardware platforms, neuromorphic concepts, and implementation philosophies reported in the field. It emphasizes the critical role of cross-disciplinary collaboration in this rapidly evolving field. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07917v2-abstract-full" style="display: none;"> This roadmap consolidates recent advances while exploring emerging applications, reflecting the remarkable diversity of hardware platforms, neuromorphic concepts, and implementation philosophies reported in the field. It emphasizes the critical role of cross-disciplinary collaboration in this rapidly evolving field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07917v2-abstract-full').style.display = 'none'; document.getElementById('2501.07917v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03689">arXiv:2501.03689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03689">pdf</a>, <a href="https://arxiv.org/format/2501.03689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MAJL: A Model-Agnostic Joint Learning Framework for Music Source Separation and Pitch Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Haojie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jun Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yueguo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03689v1-abstract-short" style="display: inline;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. 
However, these methods still face two critical ch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03689v1-abstract-full" style="display: none;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. However, these methods still face two critical challenges that limit the improvement of both tasks: the lack of labeled data and joint learning optimization. To address these challenges, we propose a Model-Agnostic Joint Learning (MAJL) framework for both tasks. MAJL is a generic framework and can use variant models for each task. It includes a two-stage training method and a dynamic weighting method named Dynamic Weights on Hard Samples (DWHS), which addresses the lack of labeled data and joint learning optimization, respectively. Experimental results on public music datasets show that MAJL outperforms state-of-the-art methods on both tasks, with significant improvements of 0.92 in Signal-to-Distortion Ratio (SDR) for music source separation and 2.71% in Raw Pitch Accuracy (RPA) for pitch estimation. Furthermore, comprehensive studies not only validate the effectiveness of each component of MAJL, but also indicate the great generality of MAJL in adapting to different model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'none'; document.getElementById('2501.03689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20083">arXiv:2412.20083</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20083">pdf</a>, <a href="https://arxiv.org/format/2412.20083">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Achieving Full-Bandwidth Sensing Performance with Partial Bandwidth Allocation for ISAC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiqiang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhiwen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qianglong Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20083v1-abstract-short" style="display: inline;"> This letter studies an uplink integrated sensing and communication (ISAC) system using discrete Fourier transform spread orthogonal frequency division multiplexing (DFT-s-OFDM) transmission. We try to answer the following fundamental question: With only a fractional bandwidth allocated to the user with sensing task, can the same delay resolution and unambiguous range be achieved as if all bandwidt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20083v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20083v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20083v1-abstract-full" style="display: none;"> This letter studies an uplink integrated sensing and communication (ISAC) system using discrete Fourier transform spread orthogonal frequency division multiplexing (DFT-s-OFDM) transmission. We try to answer the following fundamental question: With only a fractional bandwidth allocated to the user with sensing task, can the same delay resolution and unambiguous range be achieved as if all bandwidth were allocated to it? We affirmatively answer the question by proposing a novel two-stage delay estimation (TSDE) method that exploits the following facts: without increasing the allocated bandwidth, higher delay resolution can be achieved via distributed subcarrier allocation compared to its collocated counterpart, while there is a trade-off between delay resolution and unambiguous range by varying the decimation factor of subcarriers. Therefore, the key idea of the proposed TSDE method is to first perform coarse delay estimation with collocated subcarriers to achieve a large unambiguous range, and then use distributed subcarriers with optimized decimation factor to enhance delay resolution while avoiding delay ambiguity. 
Our analysis shows that the proposed TSDE method can achieve the full-bandwidth delay resolution and unambiguous range, by using only at most half of the full bandwidth, provided that the channel delay spread is less than half of the unambiguous range. Numerical results show the superiority of the proposed method over the conventional method with collocated subcarriers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20083v1-abstract-full').style.display = 'none'; document.getElementById('2412.20083v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10680">arXiv:2412.10680</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10680">pdf</a>, <a href="https://arxiv.org/format/2412.10680">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> UCDR-Adapter: Exploring Adaptation of Pre-Trained Vision-Language Models for Universal Cross-Domain Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Haoyu Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zhi-Qi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Moreira%2C+G">Gabriel Moreira</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiawen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jingdong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+B">Bukun Ren</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jun-Yan He</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+X">Xian-Sheng Hua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10680v1-abstract-short" style="display: inline;"> Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen domains and classes without semantic labels, ensuring robust generalization. Existing methods commonly employ prompt tuning with pre-trained vision-language models but are inherently limited by static prompts, reducing adaptability. 
We propose UCDR-Adapter, which enhances pre-trained models with adapters and dynamic prom&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10680v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10680v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10680v1-abstract-full" style="display: none;"> Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen domains and classes without semantic labels, ensuring robust generalization. Existing methods commonly employ prompt tuning with pre-trained vision-language models but are inherently limited by static prompts, reducing adaptability. We propose UCDR-Adapter, which enhances pre-trained models with adapters and dynamic prompt generation through a two-phase training strategy. First, Source Adapter Learning integrates class semantics with domain-specific visual knowledge using a Learnable Textual Semantic Template and optimizes Class and Domain Prompts via momentum updates and dual loss functions for robust alignment. Second, Target Prompt Generation creates dynamic prompts by attending to masked source prompts, enabling seamless adaptation to unseen domains and classes. Unlike prior approaches, UCDR-Adapter dynamically adapts to evolving data distributions, enhancing both flexibility and generalization. During inference, only the image branch and generated prompts are used, eliminating reliance on textual inputs for highly efficient retrieval. Extensive benchmark experiments show that UCDR-Adapter consistently outperforms ProS in most cases and other state-of-the-art methods on UCDR, U(c)CDR, and U(d)CDR settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10680v1-abstract-full').style.display = 'none'; document.getElementById('2412.10680v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV 2025. 
Project link: https://github.com/fine68/UCDR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05908">arXiv:2412.05908</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05908">pdf</a>, <a href="https://arxiv.org/format/2412.05908">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GBR: Generative Bundle Refinement for High-fidelity Gaussian Splatting and Meshing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yuchao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiaoyun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05908v1-abstract-short" style="display: inline;"> Gaussian splatting has gained attention for its efficient representation and rendering of 3D scenes using continuous Gaussian primitives. However, it struggles with sparse-view inputs due to limited geometric and photometric information, causing ambiguities in depth, shape, and texture. We propose GBR: Generative Bundle Refinement, a method for high-fidelity Gaussian splatting and meshing using&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05908v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05908v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05908v1-abstract-full" style="display: none;"> Gaussian splatting has gained attention for its efficient representation and rendering of 3D scenes using continuous Gaussian primitives. However, it struggles with sparse-view inputs due to limited geometric and photometric information, causing ambiguities in depth, shape, and texture. We propose GBR: Generative Bundle Refinement, a method for high-fidelity Gaussian splatting and meshing using only 4-6 input views. GBR integrates a neural bundle adjustment module to enhance geometry accuracy and a generative depth refinement module to improve geometry fidelity. More specifically, the neural bundle adjustment module integrates a foundation network to produce initial 3D point maps and point matches from unposed images, followed by bundle adjustment optimization to improve multiview consistency and point cloud accuracy. The generative depth refinement module employs a diffusion-based strategy to enhance geometric details and fidelity while preserving the scale. Finally, for Gaussian splatting optimization, we propose a multimodal loss function incorporating depth and normal consistency, geometric regularization, and pseudo-view supervision, providing robust guidance under sparse-view conditions. Experiments on widely used datasets show that GBR significantly outperforms existing methods under sparse-view inputs.
Additionally, GBR demonstrates the ability to reconstruct and render large-scale real-world scenes, such as the Pavilion of Prince Teng and the Great Wall, with remarkable details using only 6 views. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05908v1-abstract-full').style.display = 'none'; document.getElementById('2412.05908v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04531">arXiv:2412.04531</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.04531">pdf</a>, <a href="https://arxiv.org/format/2412.04531">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MageBench: Bridging Large Multimodal Models to Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Miaosen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+K">Kai Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+X">Xin Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+B">Baining Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04531v1-abstract-short" style="display: inline;"> LMMs have shown impressive visual understanding capabilities, with the potential to be applied in agents, which demand strong reasoning and planning abilities. Nevertheless, existing benchmarks mostly assess their reasoning abilities in language part, where the chain-of-thought is entirely composed of text.We consider the scenario where visual signals are continuously updated and required along th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04531v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04531v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04531v1-abstract-full" style="display: none;"> LMMs have shown impressive visual understanding capabilities, with the potential to be applied in agents, which demand strong reasoning and planning abilities. 
Nevertheless, existing benchmarks mostly assess their reasoning abilities in language part, where the chain-of-thought is entirely composed of text.We consider the scenario where visual signals are continuously updated and required along the decision making process. Such vision-in-the-chain reasoning paradigm is more aligned with the needs of multimodal agents, while being rarely evaluated. In this paper, we introduce MageBench, a reasoning capability oriented multimodal agent benchmark that, while having light-weight environments, poses significant reasoning challenges and holds substantial practical value. This benchmark currently includes three types of environments: WebUI, Sokoban, and Football, comprising a total of 483 different scenarios. It thoroughly validates the agent&#39;s knowledge and engineering capabilities, visual intelligence, and interaction skills. The results show that only a few product-level models are better than random acting, and all of them are far inferior to human-level. More specifically, we found current models severely lack the ability to modify their planning based on visual feedback, as well as visual imagination, interleaved image-text long context handling, and other abilities. We hope that our work will provide optimization directions for LMM from the perspective of being an agent. We release our code and data at https://github.com/microsoft/MageBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04531v1-abstract-full').style.display = 'none'; document.getElementById('2412.04531v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">37 pages, 32 figures, github link: https://github.com/microsoft/MageBench</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17697">arXiv:2411.17697</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17697">pdf</a>, <a href="https://arxiv.org/format/2411.17697">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StableAnimator: High-Quality Identity-Preserving Human Image Animation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tu%2C+S">Shuyuan Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Z">Zhen Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xintong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zhi-Qi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zuxuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17697v2-abstract-short" style="display: inline;"> Current diffusion models for human image animation struggle to ensure identity (ID) consistency. This paper presents StableAnimator, the first end-to-end ID-preserving video diffusion framework, which synthesizes high-quality videos without any post-processing, conditioned on a reference image and a sequence of poses. Building upon a video diffusion model, StableAnimator contains carefully designe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17697v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17697v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17697v2-abstract-full" style="display: none;"> Current diffusion models for human image animation struggle to ensure identity (ID) consistency. This paper presents StableAnimator, the first end-to-end ID-preserving video diffusion framework, which synthesizes high-quality videos without any post-processing, conditioned on a reference image and a sequence of poses. Building upon a video diffusion model, StableAnimator contains carefully designed modules for both training and inference striving for identity consistency. In particular, StableAnimator begins by computing image and face embeddings with off-the-shelf extractors, respectively and face embeddings are further refined by interacting with image embeddings using a global content-aware Face Encoder. Then, StableAnimator introduces a novel distribution-aware ID Adapter that prevents interference caused by temporal layers while preserving ID via alignment. During inference, we propose a novel Hamilton-Jacobi-Bellman (HJB) equation-based optimization to further enhance the face quality. 
We demonstrate that solving the HJB equation can be integrated into the diffusion denoising process, and the resulting solution constrains the denoising path and thus benefits ID preservation. Experiments on multiple benchmarks show the effectiveness of StableAnimator both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17697v2-abstract-full').style.display = 'none'; document.getElementById('2411.17697v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13552">arXiv:2411.13552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13552">pdf</a>, <a href="https://arxiv.org/format/2411.13552">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> REDUCIO! Generating 1024$\times$1024 Video within 16 Seconds using Extremely Compressed Motion Latents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tian%2C+R">Rui Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+K">Kai Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13552v2-abstract-short" style="display: inline;"> Commercial video generation models have exhibited realistic, high-fidelity results but are still restricted to limited access. One crucial obstacle for large-scale applications is the expensive training and inference cost. In this paper, we argue that videos contain much more redundant information than images, thus can be encoded by very few motion latents based on a content image. Towards this go&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13552v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13552v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13552v2-abstract-full" style="display: none;"> Commercial video generation models have exhibited realistic, high-fidelity results but are still restricted to limited access. One crucial obstacle for large-scale applications is the expensive training and inference cost. In this paper, we argue that videos contain much more redundant information than images, thus can be encoded by very few motion latents based on a content image. 
Towards this goal, we design an image-conditioned VAE to encode a video to an extremely compressed motion latent space. This magic Reducio charm enables 64x reduction of latents compared to a common 2D VAE, without sacrificing the quality. Training diffusion models on such a compact representation easily allows for generating 1K resolution videos. We then adopt a two-stage video generation paradigm, which performs text-to-image and text-image-to-video sequentially. Extensive experiments show that our Reducio-DiT achieves strong performance in evaluation, though trained with limited GPU resources. More importantly, our method significantly boost the efficiency of video LDMs both in training and inference. We train Reducio-DiT in around 3.2K training hours in total and generate a 16-frame 1024*1024 video clip within 15.5 seconds on a single A100 GPU. Code released at https://github.com/microsoft/Reducio-VAE . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13552v2-abstract-full').style.display = 'none'; document.getElementById('2411.13552v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code available at https://github.com/microsoft/Reducio-VAE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04997">arXiv:2411.04997</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04997">pdf</a>, <a href="https://arxiv.org/format/2411.04997">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weiquan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+A">Aoqi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xufang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuqing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Liang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+X">Xiyang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lili Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04997v3-abstract-short" style="display: inline;"> CLIP is a foundational multimodal model that aligns image 
and text features into a shared space using contrastive learning on large-scale image-text pairs. Its strength lies in leveraging natural language as a rich supervisory signal. With the rapid progress of large language models (LLMs), we explore their potential to further enhance CLIP&#39;s multimodal representation learning. This work introduce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04997v3-abstract-full').style.display = 'inline'; document.getElementById('2411.04997v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04997v3-abstract-full" style="display: none;"> CLIP is a foundational multimodal model that aligns image and text features into a shared space using contrastive learning on large-scale image-text pairs. Its strength lies in leveraging natural language as a rich supervisory signal. With the rapid progress of large language models (LLMs), we explore their potential to further enhance CLIP&#39;s multimodal representation learning. This work introduces a fine-tuning approach that integrates LLMs with the pretrained CLIP visual encoder, leveraging LLMs&#39; advanced text understanding and open-world knowledge to improve CLIP&#39;s ability to process long and complex captions. To address the challenge of LLMs&#39; autoregressive nature, we propose a caption-to-caption contrastive learning framework to enhance the discriminative power of their outputs. Our method achieves substantial performance gains on various downstream tasks, demonstrating the effectiveness of combining LLMs with CLIP for enhanced multimodal learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04997v3-abstract-full').style.display = 'none'; document.getElementById('2411.04997v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04526">arXiv:2410.04526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04526">pdf</a>, <a href="https://arxiv.org/format/2410.04526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FAMMA: A Benchmark for Financial Domain Multilingual Multimodal Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+S">Siqiao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tingting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Fan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qingyang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+Z">Zhixuan Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+H">Hongyuan Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04526v2-abstract-short" style="display: inline;"> In this paper, we introduce FAMMA, an open-source benchmark for financial multilingual multimodal question answering (QA). Our benchmark aims to evaluate the abilities of multimodal large language models (MLLMs) in answering questions that require advanced financial knowledge and sophisticated reasoning. It includes 1,758 meticulously collected question-answer pairs from university textbooks and e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04526v2-abstract-full').style.display = 'inline'; document.getElementById('2410.04526v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04526v2-abstract-full" style="display: none;"> In this paper, we introduce FAMMA, an open-source benchmark for financial multilingual multimodal question answering (QA). Our benchmark aims to evaluate the abilities of multimodal large language models (MLLMs) in answering questions that require advanced financial knowledge and sophisticated reasoning. It includes 1,758 meticulously collected question-answer pairs from university textbooks and exams, spanning 8 major subfields in finance including corporate finance, asset management, and financial engineering. Some of the QA pairs are written in Chinese or French, while a majority of them are in English. These questions are presented in a mixed format combining text and heterogeneous image types, such as charts, tables, and diagrams. We evaluate a range of state-of-the-art MLLMs on our benchmark, and our analysis shows that FAMMA poses a significant challenge for these models. Even advanced systems like GPT-4o and Claude-35-Sonnet achieve only 42\% accuracy. Additionally, the open-source Qwen2-VL lags notably behind its proprietary counterparts. Lastly, we explore GPT o1-style reasoning chains to enhance the models&#39; reasoning capabilities, which significantly improve error correction. Our FAMMA benchmark will facilitate future research to develop expert systems in financial QA. 
The leaderboard is available at https://famma-bench.github.io/famma/ .
Submitted 8 October, 2024; v1 submitted 6 October, 2024; originally announced October 2024.

arXiv:2409.20163 [pdf, other] (cs.AI, cs.CL)
MemSim: A Bayesian Simulator for Evaluating Memory of LLM-based Personal Assistants
Authors: Zeyu Zhang, Quanyu Dai, Luyu Chen, Zeren Jiang, Rui Li, Jieming Zhu, Xu Chen, Yi Xie, Zhenhua Dong, Ji-Rong Wen
Abstract: LLM-based agents have been widely applied as personal assistants, capable of memorizing information from user messages and responding to personal queries. However, an objective and automatic evaluation of their memory capability is still lacking, largely due to the challenge of constructing reliable questions and answers (QAs) from user messages. In this paper, we propose MemSim, a Bayesian simulator designed to automatically construct reliable QAs from generated user messages while keeping them diverse and scalable. Specifically, we introduce the Bayesian Relation Network (BRNet) and a causal generation mechanism to mitigate the impact of LLM hallucinations on factual information, facilitating the automatic creation of an evaluation dataset. Based on MemSim, we generate a dataset in the daily-life scenario, named MemDaily, and conduct extensive experiments to assess the effectiveness of our approach. We also provide a benchmark for evaluating different memory mechanisms in LLM-based agents with the MemDaily dataset. To benefit the research community, we have released our project at https://github.com/nuster1128/MemSim.
Submitted 30 September, 2024; originally announced September 2024.
Comments: 26 pages, 25 tables, 1 figure

arXiv:2409.19835 [pdf, other] (cs.CV, eess.IV)
MoCoLSK: Modality Conditioned High-Resolution Downscaling for Land Surface Temperature
Authors: Qun Dai, Chunyang Yuan, Yimian Dai, Yuxuan Li, Xiang Li, Kang Ni, Jianhui Xu, Xiangbo Shu, Jian Yang
Abstract: Land Surface Temperature (LST) is a critical parameter for environmental studies, but directly obtaining high spatial resolution LST data remains challenging due to the spatio-temporal trade-off in satellite remote sensing. Guided LST downscaling has emerged as an alternative solution to overcome these limitations, but current methods often neglect spatial non-stationarity, and an open-source ecosystem for deep learning methods is lacking. In this paper, we propose the Modality-Conditional Large Selective Kernel (MoCoLSK) Network, a novel architecture that dynamically fuses multi-modal data through modality-conditioned projections. MoCoLSK achieves a confluence of dynamic receptive field adjustment and multi-modal feature fusion, leading to enhanced LST prediction accuracy. Furthermore, we establish the GrokLST project, a comprehensive open-source ecosystem featuring the GrokLST dataset, a high-resolution benchmark, and the GrokLST toolkit, an open-source PyTorch-based toolkit encapsulating MoCoLSK alongside 40+ state-of-the-art approaches. Extensive experimental results validate MoCoLSK's effectiveness in capturing complex dependencies and subtle variations within multispectral data, outperforming existing methods in LST downscaling. Our code, dataset, and toolkit are available at https://github.com/GrokCV/GrokLST.
Submitted 2 March, 2025; v1 submitted 29 September, 2024; originally announced September 2024.
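The fusion idea sketched in this abstract (conditioning the receptive field applied to the thermal features on a separate guidance modality) can be illustrated with a loose PyTorch sketch: per-pixel weights predicted from the guidance features mix several dilated-convolution branches over the LST features. Module names, channel sizes, and the branch design are illustrative assumptions, not the paper's MoCoLSK implementation.

    # Illustrative sketch only: modality-conditioned selection over multi-scale
    # convolution branches; not the paper's MoCoLSK module.
    import torch
    import torch.nn as nn

    class ModalityConditionedFusion(nn.Module):
        def __init__(self, lst_channels=64, guide_channels=32, dilations=(1, 2, 4)):
            super().__init__()
            # One branch per dilation rate, i.e. per effective receptive field.
            self.branches = nn.ModuleList([
                nn.Conv2d(lst_channels, lst_channels, kernel_size=3,
                          padding=d, dilation=d)
                for d in dilations
            ])
            # The guidance modality predicts per-pixel weights over the branches.
            self.selector = nn.Conv2d(guide_channels, len(dilations), kernel_size=1)

        def forward(self, lst_feat, guide_feat):
            # lst_feat:   (B, lst_channels,   H, W) low-resolution thermal features
            # guide_feat: (B, guide_channels, H, W) co-registered guidance features
            weights = torch.softmax(self.selector(guide_feat), dim=1)           # (B, K, H, W)
            stacked = torch.stack([b(lst_feat) for b in self.branches], dim=1)  # (B, K, C, H, W)
            return (weights.unsqueeze(2) * stacked).sum(dim=1)                  # (B, C, H, W)

    # Shape check with random tensors:
    out = ModalityConditionedFusion()(torch.randn(2, 64, 56, 56), torch.randn(2, 32, 56, 56))
    print(out.shape)  # torch.Size([2, 64, 56, 56])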
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TGRS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05448">arXiv:2409.05448</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05448">pdf</a>, <a href="https://arxiv.org/format/2409.05448">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Representational Analysis of Binding in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qin Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Heinzerling%2C+B">Benjamin Heinzerling</a>, <a href="/search/cs?searchtype=author&amp;query=Inui%2C+K">Kentaro Inui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05448v3-abstract-short" style="display: inline;"> Entity tracking is essential for complex reasoning. To perform in-context entity tracking, language models (LMs) must bind an entity to its attribute (e.g., bind a container to its content) to recall attribute for a given entity. For example, given a context mentioning ``The coffee is in Box Z, the stone is in Box M, the map is in Box H&#39;&#39;, to infer ``Box Z contains the coffee&#39;&#39; later, LMs must bin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05448v3-abstract-full').style.display = 'inline'; document.getElementById('2409.05448v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05448v3-abstract-full" style="display: none;"> Entity tracking is essential for complex reasoning. To perform in-context entity tracking, language models (LMs) must bind an entity to its attribute (e.g., bind a container to its content) to recall attribute for a given entity. For example, given a context mentioning ``The coffee is in Box Z, the stone is in Box M, the map is in Box H&#39;&#39;, to infer ``Box Z contains the coffee&#39;&#39; later, LMs must bind ``Box Z&#39;&#39; to ``coffee&#39;&#39;. To explain the binding behaviour of LMs, existing research introduces a Binding ID mechanism and states that LMs use a abstract concept called Binding ID (BI) to internally mark entity-attribute pairs. However, they have not captured the Ordering ID (OI) from entity activations that directly determines the binding behaviour. In this work, we provide a novel view of the BI mechanism by localizing OI and proving the causality between OI and binding behaviour. Specifically, by leveraging dimension reduction methods (e.g., PCA), we discover that there exists a low-rank subspace in the activations of LMs, that primarily encodes the order (i.e., OI) of entity and attribute. Moreover, we also discover the causal effect of OI on binding that when editing representations along the OI encoding direction, LMs tend to bind a given entity to other attributes accordingly. For example, by patching activations along the OI encoding direction we can make the LM to infer ``Box Z contains the stone&#39;&#39; and ``Box Z contains the map&#39;&#39;. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05448v3-abstract-full').style.display = 'none'; document.getElementById('2409.05448v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20078">arXiv:2407.20078</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20078">pdf</a>, <a href="https://arxiv.org/format/2407.20078">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Background Semantics Matter: Cross-Task Feature Exchange Network for Clustered Infrared Small Target Detection With Sky-Annotated Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+M">Mengxuan Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yiming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+K">Kehua Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Huan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+X">Xiangbo Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Y">Yimian Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20078v2-abstract-short" style="display: inline;"> Infrared small target detection poses unique challenges due to the scarcity of intrinsic target features and the abundance of similar background distractors. We argue that background semantics play a pivotal role in distinguishing visually similar objects for this task. To address this, we introduce a new task--clustered infrared small target detection, and present DenseSIRST, a novel benchmark da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20078v2-abstract-full').style.display = 'inline'; document.getElementById('2407.20078v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20078v2-abstract-full" style="display: none;"> Infrared small target detection poses unique challenges due to the scarcity of intrinsic target features and the abundance of similar background distractors. We argue that background semantics play a pivotal role in distinguishing visually similar objects for this task. To address this, we introduce a new task--clustered infrared small target detection, and present DenseSIRST, a novel benchmark dataset that provides per-pixel semantic annotations for background regions, enabling the transition from sparse to dense target detection. 
Leveraging this dataset, we propose the Background-Aware Feature Exchange Network (BAFE-Net), which transforms the detection paradigm from a single task focused on the foreground to a multi-task architecture that jointly performs target detection and background semantic segmentation. BAFE-Net introduces a dynamic cross-task feature hard-exchange mechanism to embed target and background semantics between the two tasks. Furthermore, we propose the Background-Aware Gaussian Copy-Paste (BAG-CP) method, which selectively pastes small targets into sky regions during training, avoiding the creation of false alarm targets in complex non-sky backgrounds. Extensive experiments validate the effectiveness of BAG-CP and BAFE-Net in improving target detection accuracy while reducing false alarms. The DenseSIRST dataset, code, and trained models are available at https://github.com/GrokCV/BAFE-Net. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20078v2-abstract-full').style.display = 'none'; document.getElementById('2407.20078v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08209">arXiv:2407.08209</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08209">pdf</a>, <a href="https://arxiv.org/format/2407.08209">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enriching Information and Preserving Semantic Consistency in Expanding Curvilinear Object Segmentation Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Q">Qin Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+J">Jiang Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qizhu Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08209v1-abstract-short" style="display: inline;"> Curvilinear object segmentation plays a crucial role across various applications, yet datasets in this domain often suffer from small scale due to the high costs associated with data acquisition and annotation. 
To address these challenges, this paper introduces a novel approach for expanding curvilinear object segmentation datasets, focusing on enhancing the informativeness of generated data and t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08209v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08209v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08209v1-abstract-full" style="display: none;"> Curvilinear object segmentation plays a crucial role across various applications, yet datasets in this domain often suffer from small scale due to the high costs associated with data acquisition and annotation. To address these challenges, this paper introduces a novel approach for expanding curvilinear object segmentation datasets, focusing on enhancing the informativeness of generated data and the consistency between semantic maps and generated images. Our method enriches synthetic data informativeness by generating curvilinear objects through their multiple textual features. By combining textual features from each sample in original dataset, we obtain synthetic images that beyond the original dataset&#39;s distribution. This initiative necessitated the creation of the Curvilinear Object Segmentation based on Text Generation (COSTG) dataset. Designed to surpass the limitations of conventional datasets, COSTG incorporates not only standard semantic maps but also some textual descriptions of curvilinear object features. To ensure consistency between synthetic semantic maps and images, we introduce the Semantic Consistency Preserving ControlNet (SCP ControlNet). This involves an adaptation of ControlNet with Spatially-Adaptive Normalization (SPADE), allowing it to preserve semantic information that would typically be washed away in normalization layers. This modification facilitates more accurate semantic image synthesis. Experimental results demonstrate the efficacy of our approach across three types of curvilinear objects (angiography, crack and retina) and six public datasets (CHUAC, XCAD, DCA1, DRIVE, CHASEDB1 and Crack500). The synthetic data generated by our method not only expand the dataset, but also effectively improves the performance of other curvilinear object segmentation models. Source code and dataset are available at \url{https://github.com/tanlei0/COSTG}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08209v1-abstract-full').style.display = 'none'; document.getElementById('2407.08209v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
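The SPADE substitution mentioned in this abstract is the piece that maps most directly onto code: the semantic map re-injects, after a parameter-free normalization, the information that normalization layers would otherwise wash out. Below is a generic SPADE-style block in PyTorch, following the published SPADE design rather than the paper's exact SCP ControlNet module; channel sizes and names are illustrative.

    # Generic SPADE conditional normalization block (illustrative sketch).
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class SPADE(nn.Module):
        def __init__(self, feat_channels, segmap_channels, hidden=128):
            super().__init__()
            self.param_free_norm = nn.InstanceNorm2d(feat_channels, affine=False)
            self.shared = nn.Sequential(
                nn.Conv2d(segmap_channels, hidden, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
            )
            self.to_gamma = nn.Conv2d(hidden, feat_channels, kernel_size=3, padding=1)
            self.to_beta = nn.Conv2d(hidden, feat_channels, kernel_size=3, padding=1)

        def forward(self, feat, segmap):
            # feat: (B, C, H, W); segmap: (B, S, H', W') one-hot or soft semantic map.
            normalized = self.param_free_norm(feat)
            segmap = F.interpolate(segmap, size=feat.shape[-2:], mode="nearest")
            h = self.shared(segmap)
            # Spatially varying scale and shift recovered from the semantic map.
            return normalized * (1 + self.to_gamma(h)) + self.to_beta(h)

Because gamma and beta vary per pixel with the semantic map, layout information survives the normalization step, which is the property the abstract attributes to SCP ControlNet.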
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19236">arXiv:2406.19236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19236">pdf</a>, <a href="https://arxiv.org/format/2406.19236">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Human-Aware Vision-and-Language Navigation: Bridging Simulation to Reality with Dynamic Human Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Heng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Minghan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zhi-Qi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yifei Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yuxuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jun-Yan He</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Mitamura%2C+T">Teruko Mitamura</a>, <a href="/search/cs?searchtype=author&amp;query=Hauptmann%2C+A+G">Alexander G. Hauptmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19236v3-abstract-short" style="display: inline;"> Vision-and-Language Navigation (VLN) aims to develop embodied agents that navigate based on human instructions. However, current VLN frameworks often rely on static environments and optimal expert supervision, limiting their real-world applicability. To address this, we introduce Human-Aware Vision-and-Language Navigation (HA-VLN), extending traditional VLN by incorporating dynamic human activitie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19236v3-abstract-full').style.display = 'inline'; document.getElementById('2406.19236v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19236v3-abstract-full" style="display: none;"> Vision-and-Language Navigation (VLN) aims to develop embodied agents that navigate based on human instructions. However, current VLN frameworks often rely on static environments and optimal expert supervision, limiting their real-world applicability. To address this, we introduce Human-Aware Vision-and-Language Navigation (HA-VLN), extending traditional VLN by incorporating dynamic human activities and relaxing key assumptions. We propose the Human-Aware 3D (HA3D) simulator, which combines dynamic human activities with the Matterport3D dataset, and the Human-Aware Room-to-Room (HA-R2R) dataset, extending R2R with human activity descriptions. 
To tackle HA-VLN challenges, we present the Expert-Supervised Cross-Modal (VLN-CM) and Non-Expert-Supervised Decision Transformer (VLN-DT) agents, utilizing cross-modal fusion and diverse training strategies for effective navigation in dynamic human environments. A comprehensive evaluation, including metrics considering human activities, and systematic analysis of HA-VLN&#39;s unique challenges, underscores the need for further research to enhance HA-VLN agents&#39; real-world robustness and adaptability. Ultimately, this work provides benchmarks and insights for future research on embodied AI and Sim2Real transfer, paving the way for more realistic and applicable VLN systems in human-populated environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19236v3-abstract-full').style.display = 'none'; document.getElementById('2406.19236v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Spotlight at NeurIPS 2024 D&amp;B Track. 32 pages, 18 figures, Project Page: https://lpercc.github.io/HA3D_simulator/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16268">arXiv:2406.16268</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.16268">pdf</a>, <a href="https://arxiv.org/format/2406.16268">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Efficient Antagonistic k-plex Enumeration in Signed Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+L">Lantian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rong-Hua Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+D">Dong Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qiangqiang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guoren Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+L">Lu Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16268v1-abstract-short" style="display: inline;"> A signed graph is a graph where each edge receives a sign, positive or negative. The signed graph model has been used in many real applications, such as protein complex discovery and social network analysis. Finding cohesive subgraphs in signed graphs is a fundamental problem. 
A k-plex is a common model for cohesive subgraphs in which every vertex is adjacent to all but at most k vertices within t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16268v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16268v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16268v1-abstract-full" style="display: none;"> A signed graph is a graph where each edge receives a sign, positive or negative. The signed graph model has been used in many real applications, such as protein complex discovery and social network analysis. Finding cohesive subgraphs in signed graphs is a fundamental problem. A k-plex is a common model for cohesive subgraphs in which every vertex is adjacent to all but at most k vertices within the subgraph. In this paper, we propose the model of size-constrained antagonistic k-plex in a signed graph. The proposed model guarantees that the resulting subgraph is a k-plex and can be divided into two sub-k-plexes, both of which have positive inner edges and negative outer edges. This paper aims to identify all maximal antagonistic k-plexes in a signed graph. Through rigorous analysis, we show that the problem is NP-Hardness. We propose a novel framework for maximal antagonistic k-plexes utilizing set enumeration. Efficiency is improved through pivot pruning and early termination based on the color bound. Preprocessing techniques based on degree and dichromatic graphs effectively narrow the search space before enumeration. Extensive experiments on real-world datasets demonstrate our algorithm&#39;s efficiency, effectiveness, and scalability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16268v1-abstract-full').style.display = 'none'; document.getElementById('2406.16268v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
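As a concrete reading of the structure defined above, here is a small pure-Python checker: it verifies that a candidate vertex set, split into two sides, has positive edges inside each side, negative edges across sides, and satisfies the k-plex degree condition both for the whole set and within each side. The predicate encodes only what the abstract states; the paper's formal definition (e.g. the size constraint) may differ.

    # Verifier sketch for the antagonistic k-plex structure described above.
    def _is_kplex(nodes, edges, k):
        # Every vertex may be non-adjacent to at most k vertices of the set
        # (counting itself), i.e. it misses at most k - 1 of the others.
        for u in nodes:
            missing = sum(1 for v in nodes
                          if v != u and frozenset((u, v)) not in edges)
            if missing > k - 1:
                return False
        return True

    def is_antagonistic_kplex(pos_edges, neg_edges, side_a, side_b, k):
        side_a, side_b = set(side_a), set(side_b)
        nodes = side_a | side_b
        pos = {frozenset(e) for e in pos_edges}
        neg = {frozenset(e) for e in neg_edges}
        # Sign pattern: inner edges positive, crossing edges negative.
        for e in pos | neg:
            u, v = tuple(e)
            if u in nodes and v in nodes:
                same_side = (u in side_a) == (v in side_a)
                if (same_side and e in neg) or (not same_side and e in pos):
                    return False
        # The whole set is a k-plex and each side is itself a (sub-)k-plex.
        return (_is_kplex(nodes, pos | neg, k)
                and _is_kplex(side_a, pos, k)
                and _is_kplex(side_b, pos, k))

    # Toy example: two positive triangles joined by three negative edges.
    pos = [(1, 2), (2, 3), (1, 3), (4, 5), (5, 6), (4, 6)]
    neg = [(1, 4), (2, 5), (3, 6)]
    print(is_antagonistic_kplex(pos, neg, {1, 2, 3}, {4, 5, 6}, k=3))  # True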
arXiv:2406.13300 [pdf] (cs.LG)
LightGBM robust optimization algorithm based on topological data analysis
Authors: Han Yang, Guangjun Qin, Ziyuan Liu, Yongqing Hu, Qinglong Dai
Abstract: To enhance the robustness of the Light Gradient Boosting Machine (LightGBM) algorithm for image classification, a topological data analysis (TDA)-based robustness optimization algorithm for LightGBM, TDA-LightGBM, is proposed to address the interference of noise on image classification. Initially, the method partitions the feature engineering process into two streams: a pixel feature stream and a topological feature stream. Subsequently, these pixel and topological features are amalgamated into a comprehensive feature vector, serving as the input for LightGBM in image classification tasks. This fusion of features not only encompasses traditional feature engineering methodologies but also harnesses topological structure information to more accurately encapsulate the intrinsic features of the image. The objective is to surmount challenges related to unstable feature extraction and diminished classification accuracy induced by data noise in conventional image processing. Experimental findings substantiate that TDA-LightGBM achieves a 3% accuracy improvement over LightGBM on the SOCOFing dataset across five classification tasks under noisy conditions. In noise-free scenarios, TDA-LightGBM exhibits a 0.5% accuracy enhancement over LightGBM on two classification tasks, achieving a remarkable accuracy of 99.8%. Furthermore, the method elevates the classification accuracy of the Ultrasound Breast Images for Breast Cancer dataset and the Masked CASIA WebFace dataset by 6% and 15%, respectively, surpassing LightGBM in the presence of noise. These empirical results underscore the efficacy of the TDA-LightGBM approach in fortifying the robustness of LightGBM by integrating topological features, thereby augmenting the performance of image classification tasks amidst data perturbations.
Submitted 19 June, 2024; originally announced June 2024.

arXiv:2406.09397 [pdf, other] (cs.CV, cs.AI)
Aligning Vision Models with Human Aesthetics in Retrieval: Benchmarks and Algorithms
Authors: Miaosen Zhang, Yixuan Wei, Zhen Xing, Yifei Ma, Zuxuan Wu, Ji Li, Zheng Zhang, Qi Dai, Chong Luo, Xin Geng, Baining Guo
Abstract: Modern vision models are trained on very large noisy datasets. While these models acquire strong capabilities, they may not follow the user's intent to output the desired results in certain aspects, e.g., visual aesthetic, preferred style, and responsibility. In this paper, we target the realm of visual aesthetics and aim to align vision models with human aesthetic standards in a retrieval system. Advanced retrieval systems usually adopt a cascade of aesthetic models as re-rankers or filters, which are limited to low-level features like saturation and perform poorly when stylistic, cultural or knowledge contexts are involved. We find that utilizing the reasoning ability of large language models (LLMs) to rephrase the search query and extend the aesthetic expectations can make up for this shortcoming. Based on the above findings, we propose a preference-based reinforcement learning method that fine-tunes the vision models to distill the knowledge from both LLM reasoning and the aesthetic models, to better align the vision models with human aesthetics. Meanwhile, as few benchmarks are designed for evaluating retrieval systems, we leverage large multi-modality models (LMMs) to evaluate aesthetic performance. As aesthetic assessment is one of the most subjective tasks, to validate the robustness of LMMs, we further propose a novel dataset named HPIR to benchmark the alignment with human aesthetics. Experiments demonstrate that our method significantly enhances the aesthetic behaviors of the vision models under several metrics. We believe the proposed algorithm can be a general practice for aligning vision models with human values.
Submitted 13 June, 2024; originally announced June 2024.
Comments: 28 pages, 26 figures, under review

arXiv:2406.06465 [pdf, other] (cs.CV, cs.AI, cs.CL, cs.LG, cs.MM)
AID: Adapting Image2Video Diffusion Models for Instruction-guided Video Prediction
Authors: Zhen Xing, Qi Dai, Zejia Weng, Zuxuan Wu, Yu-Gang Jiang
Abstract: Text-guided video prediction (TVP) involves predicting the motion of future frames from the initial frame according to an instruction, which has wide applications in virtual reality, robotics, and content creation. Previous TVP methods make significant breakthroughs by adapting Stable Diffusion for this task. However, they struggle with frame consistency and temporal stability, primarily due to the limited scale of video datasets. We observe that pretrained Image2Video diffusion models possess good priors for video dynamics but lack textual control. Hence, transferring Image2Video models to leverage their video dynamic priors while injecting instruction control to generate controllable videos is both a meaningful and challenging task. To achieve this, we introduce the Multi-Modal Large Language Model (MLLM) to predict future video states based on initial frames and text instructions.
More specifically, we design a dual query transformer (DQFormer) architecture, which integrates the instructions and frames into the conditional embeddings for future frame prediction. Additionally, we develop Long-Short Term Temporal Adapters and Spatial Adapters that can quickly transfer general video diffusion models to specific scenarios with minimal training costs. Experimental results show that our method significantly outperforms state-of-the-art techniques on four datasets: Something Something V2, Epic Kitchen-100, Bridge Data, and UCF-101. Notably, AID achieves 91.2% and 55.5% FVD improvements on Bridge and SSv2 respectively, demonstrating its effectiveness in various domains. More examples can be found at our website https://chenhsing.github.io/AID.
Submitted 10 June, 2024; originally announced June 2024.

arXiv:2405.20325 [pdf, other] (cs.CV)
MotionFollower: Editing Video Motion via Lightweight Score-Guided Diffusion
Authors: Shuyuan Tu, Qi Dai, Zihao Zhang, Sicheng Xie, Zhi-Qi Cheng, Chong Luo, Xintong Han, Zuxuan Wu, Yu-Gang Jiang
Abstract: Despite impressive advancements in diffusion-based video editing models in altering video attributes, there has been limited exploration into modifying motion information while preserving the original protagonist's appearance and background. In this paper, we propose MotionFollower, a lightweight score-guided diffusion model for video motion editing. To introduce conditional controls to the denoising process, MotionFollower leverages two of our proposed lightweight signal controllers, one for poses and the other for appearances, both of which consist of convolution blocks without involving heavy attention calculations. Further, we design a score guidance principle based on a two-branch architecture, including the reconstruction and editing branches, which significantly enhances the modeling capability of texture details and complicated backgrounds. Concretely, we enforce several consistency regularizers and losses during the score estimation. The resulting gradients thus inject appropriate guidance to the intermediate latents, forcing the model to preserve the original background details and protagonists' appearances without interfering with the motion modification. Experiments demonstrate the competitive motion editing ability of MotionFollower qualitatively and quantitatively. Compared with MotionEditor, the most advanced motion editing model, MotionFollower achieves an approximately 80% reduction in GPU memory while delivering superior motion editing performance and exclusively supporting large camera movements and actions.
Submitted 30 May, 2024; originally announced May 2024.
Comments: 23 pages, 18 figures. Project page at https://francis-rings.github.io/MotionFollower/
MSC Class: 68T45; 68T10

arXiv:2405.16850 [pdf, other] (eess.IV, cs.CV, cs.LG)
UniCompress: Enhancing Multi-Data Medical Image Compression with Knowledge Distillation
Authors: Runzhao Yang, Yinda Chen, Zhihong Zhang, Xiaoyu Liu, Zongren Li, Kunlun He, Zhiwei Xiong, Jinli Suo, Qionghai Dai
Abstract: In the field of medical image compression, Implicit Neural Representation (INR) networks have shown remarkable versatility due to their flexible compression ratios, yet they are constrained by a one-to-one fitting approach that results in lengthy encoding times. Our novel method, UniCompress, innovatively extends the compression capabilities of INR by being the first to compress multiple medical data blocks using a single INR network. By employing wavelet transforms and quantization, we introduce a codebook containing frequency domain information as a prior input to the INR network. This enhances the representational power of INR and provides distinctive conditioning for different image blocks. Furthermore, our research introduces a new technique for the knowledge distillation of implicit representations, simplifying complex model knowledge into more manageable formats to improve compression ratios. Extensive testing on CT and electron microscopy (EM) datasets has demonstrated that UniCompress outperforms traditional INR methods and commercial compression solutions like HEVC, especially in complex and high compression scenarios. Notably, compared to existing INR techniques, UniCompress achieves a 4 to 5 times increase in compression speed, marking a significant advancement in the field of medical image compression. Codes will be publicly available.
Submitted 27 May, 2024; originally announced May 2024.

arXiv:2405.04963 [pdf, other] (cs.MM), DOI: 10.1145/3658235
Audio Matters Too! Enhancing Markerless Motion Capture with Audio Signals for String Performance Capture
Authors: Yitong Jin, Zhiping Qiu, Yi Shi, Shuangpeng Sun, Chongwu Wang, Donghao Pan, Jiachen Zhao, Zhenghao Liang, Yuan Wang, Xiaobing Li, Feng Yu, Tao Yu, Qionghai Dai
Abstract: In this paper, we touch on the problem of markerless multi-modal human motion capture, especially string performance capture, which involves inherently subtle hand-string contacts and intricate movements. To fulfill this goal, we first collect a dataset, named the String Performance Dataset (SPD), featuring cello and violin performances. The dataset includes videos captured from up to 23 different views, audio signals, and detailed 3D motion annotations of the body, hands, instrument, and bow. Moreover, to acquire the detailed motion annotations, we propose an audio-guided multi-modal motion capture framework that explicitly incorporates hand-string contacts detected from the audio signals for solving detailed hand poses. This framework serves as a baseline for string performance capture in a completely markerless manner without imposing any external devices on performers, eliminating the potential of introducing distortion in such delicate movements. We argue that the movements of performers, particularly the sound-producing gestures, contain subtle information often elusive to visual methods but which can be inferred and retrieved from audio cues. Consequently, we refine the vision-based motion capture results through our innovative audio-guided approach, simultaneously clarifying the contact relationship between the performer and the instrument, as deduced from the audio. We validate the proposed framework and conduct ablation studies to demonstrate its efficacy. Our results outperform current state-of-the-art vision-based algorithms, underscoring the feasibility of augmenting visual motion capture with the audio modality. To the best of our knowledge, SPD is the first dataset for musical instrument performance, covering fine-grained hand motion details in a multi-modal, large-scale collection.
Submitted 8 May, 2024; originally announced May 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SIGGRAPH2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14759">arXiv:2404.14759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.14759">pdf</a>, <a href="https://arxiv.org/format/2404.14759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.24963/ijcai.2024/179">10.24963/ijcai.2024/179 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unified Unsupervised Salient Object Detection via Knowledge Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wutao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Pan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+J">Jie Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14759v2-abstract-short" style="display: inline;"> Recently, unsupervised salient object detection (USOD) has gained increasing attention due to its annotation-free nature. However, current methods mainly focus on specific tasks such as RGB and RGB-D, neglecting the potential for task migration. In this paper, we propose a unified USOD framework for generic USOD tasks. Firstly, we propose a Progressive Curriculum Learning-based Saliency Distilling&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14759v2-abstract-full').style.display = 'inline'; document.getElementById('2404.14759v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14759v2-abstract-full" style="display: none;"> Recently, unsupervised salient object detection (USOD) has gained increasing attention due to its annotation-free nature. However, current methods mainly focus on specific tasks such as RGB and RGB-D, neglecting the potential for task migration. In this paper, we propose a unified USOD framework for generic USOD tasks. Firstly, we propose a Progressive Curriculum Learning-based Saliency Distilling (PCL-SD) mechanism to extract saliency cues from a pre-trained deep network. This mechanism starts with easy samples and progressively moves towards harder ones, to avoid initial interference caused by hard samples. Afterwards, the obtained saliency cues are utilized to train a saliency detector, and we employ a Self-rectify Pseudo-label Refinement (SPR) mechanism to improve the quality of pseudo-labels. Finally, an adapter-tuning method is devised to transfer the acquired saliency knowledge, leveraging shared knowledge to attain superior transferring performance on the target tasks. 
Extensive experiments on five representative SOD tasks confirm the effectiveness and feasibility of our proposed method. Code and supplement materials are available at https://github.com/I2-Multimedia-Lab/A2S-v3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14759v2-abstract-full').style.display = 'none'; document.getElementById('2404.14759v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IJCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13501">arXiv:2404.13501</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.13501">pdf</a>, <a href="https://arxiv.org/format/2404.13501">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on the Memory Mechanism of Large Language Model based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+X">Xiaohe Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chen Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jieming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhenhua Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13501v1-abstract-short" style="display: inline;"> Large language model (LLM) based agents have recently attracted much attention from the research and industry communities. Compared with original LLMs, LLM-based agents are featured in their self-evolving capability, which is the basis for solving real-world problems that need long-term and complex agent-environment interactions. The key component to support agent-environment interactions is the m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13501v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13501v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13501v1-abstract-full" style="display: none;"> Large language model (LLM) based agents have recently attracted much attention from the research and industry communities. 
Compared with original LLMs, LLM-based agents are featured in their self-evolving capability, which is the basis for solving real-world problems that need long-term and complex agent-environment interactions. The key component to support agent-environment interactions is the memory of the agents. While previous studies have proposed many promising memory mechanisms, they are scattered in different papers, and there lacks a systematical review to summarize and compare these works from a holistic perspective, failing to abstract common and effective designing patterns for inspiring future studies. To bridge this gap, in this paper, we propose a comprehensive survey on the memory mechanism of LLM-based agents. In specific, we first discuss &#39;&#39;what is&#39;&#39; and &#39;&#39;why do we need&#39;&#39; the memory in LLM-based agents. Then, we systematically review previous studies on how to design and evaluate the memory module. In addition, we also present many agent applications, where the memory module plays an important role. At last, we analyze the limitations of existing work and show important future directions. To keep up with the latest advances in this field, we create a repository at \url{https://github.com/nuster1128/LLM_Agent_Memory_Survey}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13501v1-abstract-full').style.display = 'none'; document.getElementById('2404.13501v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">39 pages, 5 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.11998">arXiv:2404.11998</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.11998">pdf</a>, <a href="https://arxiv.org/format/2404.11998">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Curriculum Point Prompting for Weakly-Supervised Referring Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qiyuan Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sibei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.11998v1-abstract-short" style="display: inline;"> Referring image segmentation (RIS) aims to precisely segment referents in images through corresponding natural language expressions, yet relying on cost-intensive mask annotations. Weakly supervised RIS thus learns from image-text pairs to pixel-level semantics, which is challenging for segmenting fine-grained masks. 
A natural approach to enhancing segmentation precision is to empower weakly super&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11998v1-abstract-full').style.display = 'inline'; document.getElementById('2404.11998v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.11998v1-abstract-full" style="display: none;"> Referring image segmentation (RIS) aims to precisely segment referents in images through corresponding natural language expressions, yet relying on cost-intensive mask annotations. Weakly supervised RIS thus learns from image-text pairs to pixel-level semantics, which is challenging for segmenting fine-grained masks. A natural approach to enhancing segmentation precision is to empower weakly supervised RIS with the image segmentation foundation model SAM. Nevertheless, we observe that simply integrating SAM yields limited benefits and can even lead to performance regression due to the inevitable noise issues and challenges in excessive focus on object parts. In this paper, we present an innovative framework, Point PrompTing (PPT), incorporated with the proposed multi-source curriculum learning strategy to address these challenges. Specifically, the core of PPT is a point generator that not only harnesses CLIP&#39;s text-image alignment capability and SAM&#39;s powerful mask generation ability but also generates negative point prompts to address the noisy and excessive focus issues inherently and effectively. In addition, we introduce a curriculum learning strategy with object-centric images to help PPT gradually learn from simpler yet precise semantic alignment to more complex RIS. Experiments demonstrate that our PPT significantly and consistently outperforms prior weakly supervised techniques on mIoU by 11.34%, 14.14%, and 6.97% across RefCOCO, RefCOCO+, and G-Ref, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11998v1-abstract-full').style.display = 'none'; document.getElementById('2404.11998v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07551">arXiv:2404.07551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.07551">pdf</a>, <a href="https://arxiv.org/format/2404.07551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Event-Enhanced Snapshot Compressive Videography at 10K FPS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Suo%2C+J">Jinli Suo</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07551v1-abstract-short" style="display: inline;"> Video snapshot compressive imaging (SCI) encodes the target dynamic scene compactly into a snapshot and reconstructs its high-speed frame sequence afterward, greatly reducing the required data footprint and transmission bandwidth as well as enabling high-speed imaging with a low frame rate intensity camera. In implementation, high-speed dynamics are encoded via temporally varying patterns, and onl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07551v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07551v1-abstract-full" style="display: none;"> Video snapshot compressive imaging (SCI) encodes the target dynamic scene compactly into a snapshot and reconstructs its high-speed frame sequence afterward, greatly reducing the required data footprint and transmission bandwidth as well as enabling high-speed imaging with a low frame rate intensity camera. In implementation, high-speed dynamics are encoded via temporally varying patterns, and only frames at corresponding temporal intervals can be reconstructed, while the dynamics occurring between consecutive frames are lost. To unlock the potential of conventional snapshot compressive videography, we propose a novel hybrid &#34;intensity+event&#34; imaging scheme by incorporating an event camera into a video SCI setup. Our proposed system consists of a dual-path optical setup to record the coded intensity measurement and intermediate event signals simultaneously, which is compact and photon-efficient by collecting the half photons discarded in conventional video SCI. Correspondingly, we developed a dual-branch Transformer utilizing the reciprocal relationship between two data modes to decode dense video frames. Extensive experiments on both simulated and real-captured data demonstrate our superiority to state-of-the-art video SCI and video frame interpolation (VFI) methods. 
Benefiting from the new hybrid design leveraging both intrinsic redundancy in videos and the unique feature of event cameras, we achieve high-quality videography at 0.1ms time intervals with a low-cost CMOS image sensor working at 24 FPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07551v1-abstract-full').style.display = 'none'; document.getElementById('2404.07551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00621">arXiv:2404.00621</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00621">pdf</a>, <a href="https://arxiv.org/format/2404.00621">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Pretraining, Adaptation, and Generation for Recommendation: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qijiong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jieming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yanting Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Z">Zhaocheng Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiao-Ming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhenhua Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00621v2-abstract-short" style="display: inline;"> Personalized recommendation serves as a ubiquitous channel for users to discover information tailored to their interests. However, traditional recommendation models primarily rely on unique IDs and categorical features for user-item matching, potentially overlooking the nuanced essence of raw item contents across multiple modalities such as text, image, audio, and video. This underutilization of m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00621v2-abstract-full').style.display = 'inline'; document.getElementById('2404.00621v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00621v2-abstract-full" style="display: none;"> Personalized recommendation serves as a ubiquitous channel for users to discover information tailored to their interests. However, traditional recommendation models primarily rely on unique IDs and categorical features for user-item matching, potentially overlooking the nuanced essence of raw item contents across multiple modalities such as text, image, audio, and video. 
This underutilization of multimodal data poses a limitation to recommender systems, especially in multimedia services like news, music, and short-video platforms. The recent advancements in large multimodal models offer new opportunities and challenges in developing content-aware recommender systems. This survey seeks to provide a comprehensive exploration of the latest advancements and future trajectories in multimodal pretraining, adaptation, and generation techniques, as well as their applications in enhancing recommender systems. Furthermore, we discuss current open challenges and opportunities for future research in this dynamic domain. We believe that this survey, alongside the curated resources, will provide valuable insights to inspire further advancements in this evolving landscape. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00621v2-abstract-full').style.display = 'none'; document.getElementById('2404.00621v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by KDD 2024. See our tutorial materials at https://mmrec.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15853">arXiv:2403.15853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15853">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An edge detection-based deep learning approach for tear meniscus height measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kesheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kunhui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+C">Chunlei He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+D">Dexing Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shoujun Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15853v1-abstract-short" style="display: inline;"> Automatic measurements of tear meniscus height (TMH) have been achieved by using deep learning techniques; however, annotation is significantly influenced by subjective factors and is both time-consuming and labor-intensive. In this paper, we introduce an automatic TMH measurement technique based on edge detection-assisted annotation within a deep learning framework. 
This method generates mask lab&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15853v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15853v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15853v1-abstract-full" style="display: none;"> Automatic measurements of tear meniscus height (TMH) have been achieved by using deep learning techniques; however, annotation is significantly influenced by subjective factors and is both time-consuming and labor-intensive. In this paper, we introduce an automatic TMH measurement technique based on edge detection-assisted annotation within a deep learning framework. This method generates mask labels less affected by subjective factors with enhanced efficiency compared to previous annotation approaches. For improved segmentation of the pupil and tear meniscus areas, the convolutional neural network Inceptionv3 was first implemented as an image quality assessment model, effectively identifying higher-quality images with an accuracy of 98.224%. Subsequently, by using the generated labels, various algorithms, including Unet, ResUnet, Deeplabv3+FcnResnet101, Deeplabv3+FcnResnet50, FcnResnet50, and FcnResnet101, were trained, with Unet demonstrating the best performance. Finally, Unet was used for automatic pupil and tear meniscus segmentation to locate the center of the pupil and calculate TMH, respectively. An evaluation of the mask quality predicted by Unet indicated a Mean Intersection over Union of 0.9362, a recall of 0.9261, a precision of 0.9423, and an F1-Score of 0.9326. Additionally, the TMH predicted by the model was assessed, with the fitting curve represented as y = 0.982x - 0.862, an overall correlation coefficient of r^2 = 0.961, and an accuracy of 94.80% (237/250). In summary, the algorithm can automatically screen images based on their quality, segment the pupil and tear meniscus areas, and automatically measure TMH. Measurement results using the AI algorithm demonstrate a high level of consistency with manual measurements, offering significant support to clinical doctors in diagnosing dry eye disease. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15853v1-abstract-full').style.display = 'none'; document.getElementById('2403.15853v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11803">arXiv:2403.11803</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11803">pdf</a>, <a href="https://arxiv.org/format/2403.11803">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Federated Modality-specific Encoders and Multimodal Anchors for Personalized Brain Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qian Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+D">Dong Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jinghan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liansheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yefeng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11803v1-abstract-short" style="display: inline;"> Most existing federated learning (FL) methods for medical image analysis only considered intramodal heterogeneity, limiting their applicability to multimodal imaging applications. In practice, it is not uncommon that some FL participants only possess a subset of the complete imaging modalities, posing inter-modal heterogeneity as a challenge to effectively training a global model on all participan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11803v1-abstract-full').style.display = 'inline'; document.getElementById('2403.11803v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11803v1-abstract-full" style="display: none;"> Most existing federated learning (FL) methods for medical image analysis only considered intramodal heterogeneity, limiting their applicability to multimodal imaging applications. In practice, it is not uncommon that some FL participants only possess a subset of the complete imaging modalities, posing inter-modal heterogeneity as a challenge to effectively training a global model on all participants&#39; data. In addition, each participant would expect to obtain a personalized model tailored for its local data characteristics from the FL in such a scenario. In this work, we propose a new FL framework with federated modality-specific encoders and multimodal anchors (FedMEMA) to simultaneously address the two concurrent issues. Above all, FedMEMA employs an exclusive encoder for each modality to account for the inter-modal heterogeneity in the first place. In the meantime, while the encoders are shared by the participants, the decoders are personalized to meet individual needs. Specifically, a server with full-modal data employs a fusion decoder to aggregate and fuse representations from all modality-specific encoders, thus bridging the modalities to optimize the encoders via backpropagation reversely. 
Meanwhile, multiple anchors are extracted from the fused multimodal representations and distributed to the clients in addition to the encoder parameters. On the other end, the clients with incomplete modalities calibrate their missing-modal representations toward the global full-modal anchors via scaled dot-product cross-attention, making up the information loss due to absent modalities while adapting the representations of present ones. FedMEMA is validated on the BraTS 2020 benchmark for multimodal brain tumor segmentation. Results show that it outperforms various up-to-date methods for multimodal and personalized FL and that its novel designs are effective. Our code is available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11803v1-abstract-full').style.display = 'none'; document.getElementById('2403.11803v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04736">arXiv:2403.04736</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.04736">pdf</a>, <a href="https://arxiv.org/format/2403.04736">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking News Recommendation in the Era of Green AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qijiong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jieming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiao-Ming Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.04736v2-abstract-short" style="display: inline;"> Over recent years, news recommender systems have gained significant attention in both academia and industry, emphasizing the need for a standardized benchmark to evaluate and compare the performance of these systems. Concurrently, Green AI advocates for reducing the energy consumption and environmental impact of machine learning. To address these concerns, we introduce the first Green AI benchmark&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04736v2-abstract-full').style.display = 'inline'; document.getElementById('2403.04736v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04736v2-abstract-full" style="display: none;"> Over recent years, news recommender systems have gained significant attention in both academia and industry, emphasizing the need for a standardized benchmark to evaluate and compare the performance of these systems. 
Concurrently, Green AI advocates for reducing the energy consumption and environmental impact of machine learning. To address these concerns, we introduce the first Green AI benchmarking framework for news recommendation, known as GreenRec, and propose a metric for assessing the tradeoff between recommendation accuracy and efficiency. Our benchmark encompasses 30 base models and their variants, covering traditional end-to-end training paradigms as well as our proposed efficient only-encode-once (OLEO) paradigm. Through experiments consuming 2000 GPU hours, we observe that the OLEO paradigm achieves competitive accuracy compared to state-of-the-art end-to-end paradigms and delivers up to a 2992\% improvement in sustainability metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04736v2-abstract-full').style.display = 'none'; document.getElementById('2403.04736v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TheWebConf&#39;24 accepted paper. A revised and condensed version of the previous work titled Only Encode Once: Making Content-based News Recommender Greener. While the core ideas and results remain consistent, the presentation scope have been modified for brevity and clarity. For the full details and extended discussions, please refer to the original long paper at arXiv:2308.14155</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18092">arXiv:2402.18092</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.18092">pdf</a>, <a href="https://arxiv.org/format/2402.18092">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Context-aware Talking Face Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xuanyuan%2C+M">Meidai Xuanyuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuwang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Honglei Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.18092v1-abstract-short" style="display: inline;"> In this paper, we consider a novel and practical case for talking face video generation. Specifically, we focus on the scenarios involving multi-people interactions, where the talking context, such as audience or surroundings, is present. 
In these situations, the video generation should take the context into consideration in order to generate video content naturally aligned with driving audios and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18092v1-abstract-full').style.display = 'inline'; document.getElementById('2402.18092v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.18092v1-abstract-full" style="display: none;"> In this paper, we consider a novel and practical case for talking face video generation. Specifically, we focus on the scenarios involving multi-people interactions, where the talking context, such as audience or surroundings, is present. In these situations, the video generation should take the context into consideration in order to generate video content naturally aligned with driving audios and spatially coherent to the context. To achieve this, we provide a two-stage and cross-modal controllable video generation pipeline, taking facial landmarks as an explicit and compact control signal to bridge the driving audio, talking context and generated videos. Inside this pipeline, we devise a 3D video diffusion model, allowing for efficient contort of both spatial conditions (landmarks and context video), as well as audio condition for temporally coherent generation. The experimental results verify the advantage of the proposed method over other baselines in terms of audio-video synchronization, video fidelity and frame consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18092v1-abstract-full').style.display = 'none'; document.getElementById('2402.18092v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03307">arXiv:2402.03307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.03307">pdf</a>, <a href="https://arxiv.org/format/2402.03307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 4D-Rotor Gaussian Splatting: Towards Efficient Novel View Synthesis for Dynamic Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Y">Yuanxing Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+F">Fangyin Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qiyu Dai</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuhang He</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wenzheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Baoquan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03307v3-abstract-short" style="display: inline;"> We consider the problem of novel-view synthesis (NVS) for dynamic scenes. 
Recent neural approaches have accomplished exceptional NVS results for static 3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior efforts often encode dynamics by learning a canonical space plus implicit or explicit deformation fields, which struggle in challenging scenarios like sudden movements or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03307v3-abstract-full').style.display = 'inline'; document.getElementById('2402.03307v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03307v3-abstract-full" style="display: none;"> We consider the problem of novel-view synthesis (NVS) for dynamic scenes. Recent neural approaches have accomplished exceptional NVS results for static 3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior efforts often encode dynamics by learning a canonical space plus implicit or explicit deformation fields, which struggle in challenging scenarios like sudden movements or generating high-fidelity renderings. In this paper, we introduce 4D Gaussian Splatting (4DRotorGS), a novel method that represents dynamic scenes with anisotropic 4D XYZT Gaussians, inspired by the success of 3D Gaussian Splatting in static scenes. We model dynamics at each timestamp by temporally slicing the 4D Gaussians, which naturally compose dynamic 3D Gaussians and can be seamlessly projected into images. As an explicit spatial-temporal representation, 4DRotorGS demonstrates powerful capabilities for modeling complicated dynamics and fine details--especially for scenes with abrupt motions. We further implement our temporal slicing and splatting techniques in a highly optimized CUDA acceleration framework, achieving real-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and 583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions showcase the superior efficiency and effectiveness of 4DRotorGS, which consistently outperforms existing methods both quantitatively and qualitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03307v3-abstract-full').style.display = 'none'; document.getElementById('2402.03307v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proc. 
SIGGRAPH, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.05641">arXiv:2401.05641</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.05641">pdf</a>, <a href="https://arxiv.org/format/2401.05641">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Operating Systems">cs.OS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> When eBPF Meets Machine Learning: On-the-fly OS Kernel Compartmentalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tiejin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qinrun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yueqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Hua Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Q">Qingkai Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.05641v1-abstract-short" style="display: inline;"> Compartmentalization effectively prevents initial corruption from turning into a successful attack. This paper presents O2C, a pioneering system designed to enforce OS kernel compartmentalization on the fly. It not only provides immediate remediation for sudden threats but also maintains consistent system availability through the enforcement process. O2C is empowered by the newest advancements o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05641v1-abstract-full').style.display = 'inline'; document.getElementById('2401.05641v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.05641v1-abstract-full" style="display: none;"> Compartmentalization effectively prevents initial corruption from turning into a successful attack. This paper presents O2C, a pioneering system designed to enforce OS kernel compartmentalization on the fly. It not only provides immediate remediation for sudden threats but also maintains consistent system availability through the enforcement process. O2C is empowered by the newest advancements of the eBPF ecosystem, which allows instrumenting eBPF programs that perform enforcement actions into the kernel at runtime. O2C takes the lead in embedding a machine learning model into eBPF programs, addressing unique challenges in on-the-fly compartmentalization. Our comprehensive evaluation shows that O2C effectively confines damage within the compartment. Further, we validate that the decision tree is optimally suited for O2C owing to its advantages in processing tabular data, its explainable nature, and its compliance with the eBPF ecosystem. Last but not least, O2C is lightweight, showing negligible overhead and excellent scalability system-wide.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05641v1-abstract-full').style.display = 'none'; document.getElementById('2401.05641v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.03153">arXiv:2401.03153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.03153">pdf</a>, <a href="https://arxiv.org/format/2401.03153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Event-Oriented Diffusion-Refinement Method for Sparse Events Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yuqi Han</a>, <a href="/search/cs?searchtype=author&amp;query=Suo%2C+J">Jinli Suo</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.03153v1-abstract-short" style="display: inline;"> Event cameras or dynamic vision sensors (DVS) record asynchronous response to brightness changes instead of conventional intensity frames, and feature ultra-high sensitivity at low bandwidth. The new mechanism demonstrates great advantages in challenging scenarios with fast motion and large dynamic range. However, the recorded events might be highly sparse due to either limited hardware bandwidth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03153v1-abstract-full').style.display = 'inline'; document.getElementById('2401.03153v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.03153v1-abstract-full" style="display: none;"> Event cameras or dynamic vision sensors (DVS) record asynchronous response to brightness changes instead of conventional intensity frames, and feature ultra-high sensitivity at low bandwidth. The new mechanism demonstrates great advantages in challenging scenarios with fast motion and large dynamic range. However, the recorded events might be highly sparse due to either limited hardware bandwidth or extreme photon starvation in harsh environments. To unlock the full potential of event cameras, we propose an inventive event sequence completion approach conforming to the unique characteristics of event data in both the processing stage and the output form. Specifically, we treat event streams as 3D event clouds in the spatiotemporal domain, develop a diffusion-based generative model to generate dense clouds in a coarse-to-fine manner, and recover exact timestamps to maintain the temporal resolution of raw data successfully. 
To validate the effectiveness of our method comprehensively, we perform extensive experiments on three widely used public datasets with different spatial resolutions, and additionally collect a novel event dataset covering diverse scenarios with highly dynamic motions and under harsh illumination. Besides generating high-quality dense events, our method can benefit downstream applications such as object classification and intensity frame reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03153v1-abstract-full').style.display = 'none'; document.getElementById('2401.03153v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18837">arXiv:2311.18837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.18837">pdf</a>, <a href="https://arxiv.org/format/2311.18837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Z">Zhen Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Han Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18837v1-abstract-short" style="display: inline;"> Diffusion models have achieved significant success in image and video generation. This motivates a growing interest in video editing tasks, where videos are edited according to provided text descriptions. However, most existing approaches only focus on video editing for short clips and rely on time-consuming tuning or inference. We are the first to propose Video Instruction Diffusion (VIDiff), a u&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18837v1-abstract-full').style.display = 'inline'; document.getElementById('2311.18837v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18837v1-abstract-full" style="display: none;"> Diffusion models have achieved significant success in image and video generation. 
This motivates a growing interest in video editing tasks, where videos are edited according to provided text descriptions. However, most existing approaches only focus on video editing for short clips and rely on time-consuming tuning or inference. We are the first to propose Video Instruction Diffusion (VIDiff), a unified foundation model designed for a wide range of video tasks. These tasks encompass both understanding tasks (such as language-guided video object segmentation) and generative tasks (video editing and enhancement). Our model can edit and translate the desired results within seconds based on user instructions. Moreover, we design an iterative auto-regressive method to ensure consistency in editing and enhancing long videos. We provide convincing generative results for diverse input videos and written instructions, both qualitatively and quantitatively. More examples can be found at our website https://ChenHsing.github.io/VIDiff. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18837v1-abstract-full').style.display = 'none'; document.getElementById('2311.18837v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18834">arXiv:2311.18834</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.18834">pdf</a>, <a href="https://arxiv.org/format/2311.18834">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ART$\boldsymbol{\cdot}$V: Auto-Regressive Text-to-Video Generation with Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Weng%2C+W">Wenming Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+R">Ruoyu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanhui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chunyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Dacheng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhiyuan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+K">Kai Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yuhui Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yueyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Z">Zhiwei Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18834v1-abstract-short" style="display: inline;"> We present ART$\boldsymbol{\cdot}$V, an efficient framework for auto-regressive video generation with diffusion models. 
Unlike existing methods that generate entire videos in one-shot, ART$\boldsymbol{\cdot}$V generates a single frame at a time, conditioned on the previous ones. The framework offers three distinct advantages. First, it only learns simple continual motions between adjacent frames,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18834v1-abstract-full').style.display = 'inline'; document.getElementById('2311.18834v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18834v1-abstract-full" style="display: none;"> We present ART$\boldsymbol{\cdot}$V, an efficient framework for auto-regressive video generation with diffusion models. Unlike existing methods that generate entire videos in one-shot, ART$\boldsymbol{\cdot}$V generates a single frame at a time, conditioned on the previous ones. The framework offers three distinct advantages. First, it only learns simple continual motions between adjacent frames, therefore avoiding modeling complex long-range motions that require huge training data. Second, it preserves the high-fidelity generation ability of the pre-trained image diffusion models by making only minimal network modifications. Third, it can generate arbitrarily long videos conditioned on a variety of prompts such as text, image or their combinations, making it highly versatile and flexible. To combat the common drifting issue in AR models, we propose masked diffusion model which implicitly learns which information can be drawn from reference images rather than network predictions, in order to reduce the risk of generating inconsistent appearances that cause drifting. Moreover, we further enhance generation coherence by conditioning it on the initial frame, which typically contains minimal noise. This is particularly useful for long video generation. When trained for only two weeks on four GPUs, ART$\boldsymbol{\cdot}$V already can generate videos with natural motions, rich details and a high level of aesthetic quality. Besides, it enables various appealing applications, e.g., composing a long video from multiple text prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18834v1-abstract-full').style.display = 'none'; document.getElementById('2311.18834v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 21 figures. 
Project page at https://warranweng.github.io/art.v</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Dai%2C+Q&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Dai%2C+Q&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dai%2C+Q&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dai%2C+Q&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dai%2C+Q&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon 
filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
