Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 229 results for author: <span class="mathjax">Ghanem, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Ghanem%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ghanem, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ghanem%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ghanem, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Ghanem%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14974">arXiv:2411.14974</a> <span> [<a href="https://arxiv.org/pdf/2411.14974">pdf</a>, <a href="https://arxiv.org/format/2411.14974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D Convex Splatting: Radiance Field Rendering with 3D Smooth Convexes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Held%2C+J">Jan Held</a>, <a href="/search/cs?searchtype=author&query=Vandeghen%2C+R">Renaud Vandeghen</a>, <a href="/search/cs?searchtype=author&query=Hamdi%2C+A">Abdullah Hamdi</a>, <a href="/search/cs?searchtype=author&query=Deliege%2C+A">Adrien Deliege</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Vedaldi%2C+A">Andrea Vedaldi</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14974v2-abstract-short" style="display: inline;"> Recent advances in radiance field reconstruction, such as 3D Gaussian Splatting (3DGS), have achieved high-quality novel view synthesis and fast rendering by representing scenes with compositions of Gaussian primitives. 
However, 3D Gaussians present several limitations for scene reconstruction. Accurately capturing hard edges is challenging without significantly increasing the number of Gaussians,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14974v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14974v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14974v2-abstract-full" style="display: none;"> Recent advances in radiance field reconstruction, such as 3D Gaussian Splatting (3DGS), have achieved high-quality novel view synthesis and fast rendering by representing scenes with compositions of Gaussian primitives. However, 3D Gaussians present several limitations for scene reconstruction. Accurately capturing hard edges is challenging without significantly increasing the number of Gaussians, creating a large memory footprint. Moreover, they struggle to represent flat surfaces, as they are diffused in space. Without hand-crafted regularizers, they tend to disperse irregularly around the actual surface. To circumvent these issues, we introduce a novel method, named 3D Convex Splatting (3DCS), which leverages 3D smooth convexes as primitives for modeling geometrically-meaningful radiance fields from multi-view images. Smooth convex shapes offer greater flexibility than Gaussians, allowing for a better representation of 3D scenes with hard edges and dense volumes using fewer primitives. Powered by our efficient CUDA-based rasterizer, 3DCS achieves superior performance over 3DGS on benchmarks such as Mip-NeRF360, Tanks and Temples, and Deep Blending. Specifically, our method attains an improvement of up to 0.81 in PSNR and 0.026 in LPIPS compared to 3DGS while maintaining high rendering speeds and reducing the number of required primitives. Our results highlight the potential of 3D Convex Splatting to become the new standard for high-quality scene reconstruction and novel view synthesis. Project page: convexsplatting.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14974v2-abstract-full').style.display = 'none'; document.getElementById('2411.14974v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 13 figures, 10 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12293">arXiv:2411.12293</a> <span> [<a href="https://arxiv.org/pdf/2411.12293">pdf</a>, <a href="https://arxiv.org/format/2411.12293">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Generative Timelines for Instructed Visual Assembly </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pardo%2C+A">Alejandro Pardo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jui-Hsien Wang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Sivic%2C+J">Josef Sivic</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+B">Bryan Russell</a>, <a href="/search/cs?searchtype=author&query=Heilbron%2C+F+C">Fabian Caba Heilbron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12293v1-abstract-short" style="display: inline;"> The objective of this work is to manipulate visual timelines (e.g. a video) through natural language instructions, making complex timeline editing tasks accessible to non-expert or potentially even disabled users. We call this task Instructed visual assembly. This task is challenging as it requires (i) identifying relevant visual content in the input timeline as well as retrieving relevant visual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12293v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12293v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12293v1-abstract-full" style="display: none;"> The objective of this work is to manipulate visual timelines (e.g. a video) through natural language instructions, making complex timeline editing tasks accessible to non-expert or potentially even disabled users. We call this task Instructed visual assembly. This task is challenging as it requires (i) identifying relevant visual content in the input timeline as well as retrieving relevant visual content in a given input (video) collection, (ii) understanding the input natural language instruction, and (iii) performing the desired edits of the input visual timeline to produce an output timeline. To address these challenges, we propose the Timeline Assembler, a generative model trained to perform instructed visual assembly tasks. The contributions of this work are three-fold. First, we develop a large multimodal language model, which is designed to process visual content, compactly represent timelines and accurately interpret timeline editing instructions. 
2. arXiv:2411.12293 [pdf, other] (cs.CV; cs.HC; cs.MM)
   Generative Timelines for Instructed Visual Assembly
   Authors: Alejandro Pardo, Jui-Hsien Wang, Bernard Ghanem, Josef Sivic, Bryan Russell, Fabian Caba Heilbron
   Abstract: The objective of this work is to manipulate visual timelines (e.g. a video) through natural language instructions, making complex timeline editing tasks accessible to non-expert or potentially even disabled users. We call this task Instructed visual assembly. This task is challenging as it requires (i) identifying relevant visual content in the input timeline as well as retrieving relevant visual content in a given input (video) collection, (ii) understanding the input natural language instruction, and (iii) performing the desired edits of the input visual timeline to produce an output timeline. To address these challenges, we propose the Timeline Assembler, a generative model trained to perform instructed visual assembly tasks. The contributions of this work are three-fold. First, we develop a large multimodal language model, which is designed to process visual content, compactly represent timelines and accurately interpret timeline editing instructions. Second, we introduce a novel method for automatically generating datasets for visual assembly tasks, enabling efficient training of our model without the need for human-labeled data. Third, we validate our approach by creating two novel datasets for image and video assembly, demonstrating that the Timeline Assembler substantially outperforms established baseline models, including the recent GPT-4o, in accurately executing complex assembly instructions across various real-world inspired scenarios.
   Submitted 19 November, 2024; originally announced November 2024.
3. arXiv:2411.11581 [pdf, other] (cs.CL)
   OASIS: Open Agent Social Interaction Simulations with One Million Agents
   Authors: Ziyi Yang, Zaibin Zhang, Zirui Zheng, Yuxian Jiang, Ziyue Gan, Zhiyu Wang, Zijian Ling, Jinsong Chen, Martz Ma, Bowen Dong, Prateek Gupta, Shuyue Hu, Zhenfei Yin, Guohao Li, Xu Jia, Lijun Wang, Bernard Ghanem, Huchuan Lu, Chaochao Lu, Wanli Ouyang, Yu Qiao, Philip Torr, Jing Shao
   Abstract: There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (e.g., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (e.g., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can be easily extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that larger agent groups lead to richer group dynamics and to more diverse and helpful agent opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments.
   Submitted 26 November, 2024; v1 submitted 18 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01304">arXiv:2410.01304</a> <span> [<a href="https://arxiv.org/pdf/2410.01304">pdf</a>, <a href="https://arxiv.org/format/2410.01304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep learning for action spotting in association football videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01304v1-abstract-short" style="display: inline;"> The task of action spotting consists in both identifying actions and precisely localizing them in time with a single timestamp in long, untrimmed video streams. Automatically extracting those actions is crucial for many sports applications, including sports analytics to produce extended statistics on game actions, coaching to provide support to video analysts, or fan engagement to automatically ov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01304v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01304v1-abstract-full" style="display: none;"> The task of action spotting consists in both identifying actions and precisely localizing them in time with a single timestamp in long, untrimmed video streams. Automatically extracting those actions is crucial for many sports applications, including sports analytics to produce extended statistics on game actions, coaching to provide support to video analysts, or fan engagement to automatically overlay content in the broadcast when specific actions occur. However, before 2018, no large-scale datasets for action spotting in sports were publicly available, which impeded benchmarking action spotting methods. In response, our team built the largest dataset and the most comprehensive benchmarks for sports video understanding, under the umbrella of SoccerNet. Particularly, our dataset contains a subset specifically dedicated to action spotting, called SoccerNet Action Spotting, containing more than 550 complete broadcast games annotated with almost all types of actions that can occur in a football game. This dataset is tailored to develop methods for automatic spotting of actions of interest, including deep learning approaches, by providing a large amount of manually annotated actions. To engage with the scientific community, the SoccerNet initiative organizes yearly challenges, during which participants from all around the world compete to achieve state-of-the-art performances. Thanks to our dataset and challenges, more than 60 methods were developed or published over the past five years, improving on the first baselines and making action spotting a viable option for the sports industry. 
This paper traces the history of action spotting in sports, from the creation of the task back in 2018, to the role it plays today in research and the sports industry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01304v1-abstract-full').style.display = 'none'; document.getElementById('2410.01304v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 2 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10587">arXiv:2409.10587</a> <span> [<a href="https://arxiv.org/pdf/2409.10587">pdf</a>, <a href="https://arxiv.org/format/2409.10587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SoccerNet 2024 Challenges Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Somers%2C+V">Vladimir Somers</a>, <a href="/search/cs?searchtype=author&query=Joos%2C+V">Victor Joos</a>, <a href="/search/cs?searchtype=author&query=Magera%2C+F">Floriane Magera</a>, <a href="/search/cs?searchtype=author&query=Held%2C+J">Jan Held</a>, <a href="/search/cs?searchtype=author&query=Ghasemzadeh%2C+S+A">Seyed Abolfazl Ghasemzadeh</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&query=Seweryn%2C+K">Karolina Seweryn</a>, <a href="/search/cs?searchtype=author&query=Kowalczyk%2C+M">Mateusz Kowalczyk</a>, <a href="/search/cs?searchtype=author&query=Mr%C3%B3z%2C+Z">Zuzanna Mr贸z</a>, <a href="/search/cs?searchtype=author&query=%C5%81ukasik%2C+S">Szymon 艁ukasik</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%82o%C5%84%2C+M">Micha艂 Ha艂o艅</a>, <a href="/search/cs?searchtype=author&query=Mkhallati%2C+H">Hassan Mkhallati</a>, <a href="/search/cs?searchtype=author&query=Deli%C3%A8ge%2C+A">Adrien Deli猫ge</a>, <a href="/search/cs?searchtype=author&query=Hinojosa%2C+C">Carlos Hinojosa</a>, <a href="/search/cs?searchtype=author&query=Sanchez%2C+K">Karen Sanchez</a>, <a href="/search/cs?searchtype=author&query=Mansourian%2C+A+M">Amir M. Mansourian</a>, <a href="/search/cs?searchtype=author&query=Miralles%2C+P">Pierre Miralles</a>, <a href="/search/cs?searchtype=author&query=Barnich%2C+O">Olivier Barnich</a>, <a href="/search/cs?searchtype=author&query=De+Vleeschouwer%2C+C">Christophe De Vleeschouwer</a>, <a href="/search/cs?searchtype=author&query=Alahi%2C+A">Alexandre Alahi</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a>, <a href="/search/cs?searchtype=author&query=Gorski%2C+A">Adam Gorski</a> , et al. 
5. arXiv:2409.10587 [pdf, other] (cs.CV)
   SoccerNet 2024 Challenges Results
   Authors: Anthony Cioppa, Silvio Giancola, Vladimir Somers, Victor Joos, Floriane Magera, Jan Held, Seyed Abolfazl Ghasemzadeh, Xin Zhou, Karolina Seweryn, Mateusz Kowalczyk, Zuzanna Mróz, Szymon Łukasik, Michał Hałoń, Hassan Mkhallati, Adrien Deliège, Carlos Hinojosa, Karen Sanchez, Amir M. Mansourian, Pierre Miralles, Olivier Barnich, Christophe De Vleeschouwer, Alexandre Alahi, Bernard Ghanem, Marc Van Droogenbroeck, Adam Gorski, et al. (59 additional authors not shown)
   Abstract: The SoccerNet 2024 challenges represent the fourth annual video understanding challenges organized by the SoccerNet team. These challenges aim to advance research across multiple themes in football, including broadcast video understanding, field understanding, and player understanding. This year, the challenges encompass four vision-based tasks. (1) Ball Action Spotting, focusing on precisely localizing when and which soccer actions related to the ball occur, (2) Dense Video Captioning, focusing on describing the broadcast with natural language and anchored timestamps, (3) Multi-View Foul Recognition, a novel task focusing on analyzing multiple viewpoints of a potential foul incident to classify whether a foul occurred and assess its severity, (4) Game State Reconstruction, another novel task focusing on reconstructing the game state from broadcast videos onto a 2D top-view map of the field. Detailed information about the tasks, challenges, and leaderboards can be found at https://www.soccer-net.org, with baselines and development kits available at https://github.com/SoccerNet.
   Submitted 16 September, 2024; originally announced September 2024.
   Comments: 7 pages, 1 figure
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13135">arXiv:2408.13135</a> <span> [<a href="https://arxiv.org/pdf/2408.13135">pdf</a>, <a href="https://arxiv.org/format/2408.13135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning at the Intersection: Certified Robustness as a Tool for 3D Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=S%2C+G+P">Gabriel P茅rez S</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&query=Zarzar%2C+J">Jes煤s Zarzar</a>, <a href="/search/cs?searchtype=author&query=Rojas%2C+S">Sara Rojas</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13135v1-abstract-short" style="display: inline;"> This paper presents preliminary work on a novel connection between certified robustness in machine learning and the modeling of 3D objects. We highlight an intriguing link between the Maximal Certified Radius (MCR) of a classifier representing a space's occupancy and the space's Signed Distance Function (SDF). Leveraging this relationship, we propose to use the certification method of randomized s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13135v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13135v1-abstract-full" style="display: none;"> This paper presents preliminary work on a novel connection between certified robustness in machine learning and the modeling of 3D objects. We highlight an intriguing link between the Maximal Certified Radius (MCR) of a classifier representing a space's occupancy and the space's Signed Distance Function (SDF). Leveraging this relationship, we propose to use the certification method of randomized smoothing (RS) to compute SDFs. Since RS' high computational cost prevents its practical usage as a way to compute SDFs, we propose an algorithm to efficiently run RS in low-dimensional applications, such as 3D space, by expressing RS' fundamental operations as Gaussian smoothing on pre-computed voxel grids. Our approach offers an innovative and practical tool to compute SDFs, validated through proof-of-concept experiments in novel view synthesis. This paper bridges two previously disparate areas of machine learning, opening new avenues for further exploration and potential cross-domain advancements. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13135v1-abstract-full').style.display = 'none'; document.getElementById('2408.13135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is an accepted extended abstract to the LatinX workshop at ICCV 2023. This was uploaded a year late</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10827">arXiv:2408.10827</a> <span> [<a href="https://arxiv.org/pdf/2408.10827">pdf</a>, <a href="https://arxiv.org/format/2408.10827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CO2Wounds-V2: Extended Chronic Wounds Dataset From Leprosy Patients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sanchez%2C+K">Karen Sanchez</a>, <a href="/search/cs?searchtype=author&query=Hinojosa%2C+C">Carlos Hinojosa</a>, <a href="/search/cs?searchtype=author&query=Mieles%2C+O">Olinto Mieles</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Arguello%2C+H">Henry Arguello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10827v1-abstract-short" style="display: inline;"> Chronic wounds pose an ongoing health concern globally, largely due to the prevalence of conditions such as diabetes and leprosy's disease. The standard method of monitoring these wounds involves visual inspection by healthcare professionals, a practice that could present challenges for patients in remote areas with inadequate transportation and healthcare infrastructure. This has led to the devel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10827v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10827v1-abstract-full" style="display: none;"> Chronic wounds pose an ongoing health concern globally, largely due to the prevalence of conditions such as diabetes and leprosy's disease. The standard method of monitoring these wounds involves visual inspection by healthcare professionals, a practice that could present challenges for patients in remote areas with inadequate transportation and healthcare infrastructure. This has led to the development of algorithms designed for the analysis and follow-up of wound images, which perform image-processing tasks such as classification, detection, and segmentation. 
7. arXiv:2408.10827 [pdf, other] (eess.IV; cs.CV)
   CO2Wounds-V2: Extended Chronic Wounds Dataset From Leprosy Patients
   Authors: Karen Sanchez, Carlos Hinojosa, Olinto Mieles, Chen Zhao, Bernard Ghanem, Henry Arguello
   Abstract: Chronic wounds pose an ongoing health concern globally, largely due to the prevalence of conditions such as diabetes and leprosy. The standard method of monitoring these wounds involves visual inspection by healthcare professionals, a practice that could present challenges for patients in remote areas with inadequate transportation and healthcare infrastructure. This has led to the development of algorithms designed for the analysis and follow-up of wound images, which perform image-processing tasks such as classification, detection, and segmentation. However, the effectiveness of these algorithms heavily depends on the availability of comprehensive and varied wound image data, which is usually scarce. This paper introduces the CO2Wounds-V2 dataset, an extended collection of RGB wound images from leprosy patients with their corresponding semantic segmentation annotations, aiming to enhance the development and testing of image-processing algorithms in the medical field.
   Submitted 20 August, 2024; originally announced August 2024.
   Comments: 2024 IEEE International Conference on Image Processing (ICIP 2024)
8. arXiv:2408.10739 [pdf, other] (cs.CV)
   TrackNeRF: Bundle Adjusting NeRF from Sparse and Noisy Views via Feature Tracks
   Authors: Jinjie Mai, Wenxuan Zhu, Sara Rojas, Jesus Zarzar, Abdullah Hamdi, Guocheng Qian, Bing Li, Silvio Giancola, Bernard Ghanem
   Abstract: Neural radiance fields (NeRFs) generally require many images with accurate poses for accurate novel view synthesis, which does not reflect realistic setups where views can be sparse and poses can be noisy. Previous solutions for learning NeRFs with sparse views and noisy poses only consider local geometry consistency with pairs of views. Closely following bundle adjustment in Structure-from-Motion (SfM), we introduce TrackNeRF for more globally consistent geometry reconstruction and more accurate pose optimization. TrackNeRF introduces feature tracks, i.e., connected pixel trajectories across all visible views that correspond to the same 3D points. By enforcing reprojection consistency among feature tracks, TrackNeRF encourages holistic 3D consistency explicitly. Through extensive experiments, TrackNeRF sets a new benchmark in noisy and sparse view reconstruction. In particular, TrackNeRF shows significant improvements over the state-of-the-art BARF and SPARF by ~8 and ~1 in terms of PSNR on DTU under various sparse and noisy view setups. The code is available at https://tracknerf.github.io/.
   Submitted 20 August, 2024; originally announced August 2024.
   Comments: ECCV 2024 (supplemental pages included)
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17792v2-abstract-full').style.display = 'none'; document.getElementById('2407.17792v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1st in Moment Queries track at the Ego4D Challenge 2024; 1st in Action Recognition, Action Detection, and Audio-Based Interaction Detection tracks at the EPIC-Kitchens Challenge 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13036">arXiv:2407.13036</a> <span> [<a href="https://arxiv.org/pdf/2407.13036">pdf</a>, <a href="https://arxiv.org/format/2407.13036">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ColorMAE: Exploring data-independent masking strategies in Masked AutoEncoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hinojosa%2C+C">Carlos Hinojosa</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuming Liu</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13036v1-abstract-short" style="display: inline;"> Masked AutoEncoders (MAE) have emerged as a robust self-supervised framework, offering remarkable performance across a wide range of downstream tasks. To increase the difficulty of the pretext task and learn richer visual representations, existing works have focused on replacing standard random masking with more sophisticated strategies, such as adversarial-guided and teacher-guided masking. Howev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13036v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13036v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13036v1-abstract-full" style="display: none;"> Masked AutoEncoders (MAE) have emerged as a robust self-supervised framework, offering remarkable performance across a wide range of downstream tasks. To increase the difficulty of the pretext task and learn richer visual representations, existing works have focused on replacing standard random masking with more sophisticated strategies, such as adversarial-guided and teacher-guided masking. 
10. arXiv:2407.13036 [pdf, other] (cs.CV; cs.AI; cs.LG; eess.IV)
   ColorMAE: Exploring data-independent masking strategies in Masked AutoEncoders
   Authors: Carlos Hinojosa, Shuming Liu, Bernard Ghanem
   Abstract: Masked AutoEncoders (MAE) have emerged as a robust self-supervised framework, offering remarkable performance across a wide range of downstream tasks. To increase the difficulty of the pretext task and learn richer visual representations, existing works have focused on replacing standard random masking with more sophisticated strategies, such as adversarial-guided and teacher-guided masking. However, these strategies depend on the input data, thus commonly increasing the model complexity and requiring additional calculations to generate the mask patterns. This raises the question: Can we enhance MAE performance beyond random masking without relying on input data or incurring additional computational costs? In this work, we introduce a simple yet effective data-independent method, termed ColorMAE, which generates different binary mask patterns by filtering random noise. Drawing inspiration from color noise in image processing, we explore four types of filters to yield mask patterns with different spatial and semantic priors. ColorMAE requires no additional learnable parameters or computational overhead in the network, yet it significantly enhances the learned representations. We provide a comprehensive empirical evaluation, demonstrating our strategy's superiority in downstream tasks compared to random masking. Notably, we report an improvement of 2.72 in mIoU in semantic segmentation tasks relative to baseline MAE implementations.
   Submitted 17 July, 2024; originally announced July 2024.
   Comments: Work Accepted for Publication at ECCV 2024
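   As a rough illustration of "generating binary mask patterns by filtering random noise" from the entry above, the sketch below low-pass filters white noise on a ViT patch grid and masks the patches with the largest filtered values; the Gaussian filter, grid size, and 75% masking ratio are assumptions for this example rather than the paper's settings.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def noise_filtered_mask(grid_h, grid_w, mask_ratio=0.75, sigma=2.0, seed=0):
    """Data-independent binary mask over a ViT patch grid.

    White noise is low-pass filtered (one possible "color" of noise), and the
    patches with the largest filtered values are marked as masked until the
    requested ratio is reached. True = patch is masked (hidden from the encoder).
    """
    rng = np.random.default_rng(seed)
    noise = rng.random((grid_h, grid_w))          # white noise on the patch grid
    smooth = gaussian_filter(noise, sigma=sigma)  # low-pass filter -> blob-like structure
    k = int(round(mask_ratio * grid_h * grid_w))  # number of patches to mask
    threshold = np.sort(smooth.ravel())[-k]       # value of the k-th largest entry
    return smooth >= threshold

mask = noise_filtered_mask(14, 14)                # 14x14 patch grid, as in ViT-B/16 on 224px inputs
print(mask.shape, mask.mean())                    # ~75% of patches masked, in contiguous blobs
```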
However, the VAR is currently limited to professional leag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12483v2-abstract-full').style.display = 'inline'; document.getElementById('2407.12483v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12483v2-abstract-full" style="display: none;"> Over the past decade, the technology used by referees in football has improved substantially, enhancing the fairness and accuracy of decisions. This progress has culminated in the implementation of the Video Assistant Referee (VAR), an innovation that enables backstage referees to review incidents on the pitch from multiple points of view. However, the VAR is currently limited to professional leagues due to its expensive infrastructure and the lack of referees worldwide. In this paper, we present the semi-automated Video Assistant Referee System (VARS) that leverages the latest findings in multi-view video analysis. VARS sets a new state-of-the-art on the SoccerNet-MVFoul dataset, a multi-view video dataset of football fouls, recognizing the type of foul in 50% of instances and the appropriate sanction in 46% of cases. Finally, we conducted a comparative study to investigate human performance in classifying fouls and their corresponding severity and compared these findings to our VARS. The results of our study highlight the potential of our VARS to reach human performance and support football refereeing across all levels of professional and amateur federations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12483v2-abstract-full').style.display = 'none'; document.getElementById('2407.12483v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is subject to the peer review process of Sports Engineering</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08822">arXiv:2407.08822</a> <span> [<a href="https://arxiv.org/pdf/2407.08822">pdf</a>, <a href="https://arxiv.org/format/2407.08822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FedMedICL: Towards Holistic Evaluation of Distribution Shifts in Federated Medical Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Alhamoud%2C+K">Kumail Alhamoud</a>, <a href="/search/cs?searchtype=author&query=Ghunaim%2C+Y">Yasir Ghunaim</a>, <a href="/search/cs?searchtype=author&query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&query=Hartvigsen%2C+T">Thomas Hartvigsen</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Ghassemi%2C+M">Marzyeh Ghassemi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08822v1-abstract-short" style="display: inline;"> For medical imaging AI models to be clinically impactful, they must generalize. However, this goal is hindered by (i) diverse types of distribution shifts, such as temporal, demographic, and label shifts, and (ii) limited diversity in datasets that are siloed within single medical institutions. While these limitations have spurred interest in federated learning, current evaluation benchmarks fail… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08822v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08822v1-abstract-full" style="display: none;"> For medical imaging AI models to be clinically impactful, they must generalize. However, this goal is hindered by (i) diverse types of distribution shifts, such as temporal, demographic, and label shifts, and (ii) limited diversity in datasets that are siloed within single medical institutions. While these limitations have spurred interest in federated learning, current evaluation benchmarks fail to evaluate different shifts simultaneously. However, in real healthcare settings, multiple types of shifts co-exist, yet their impact on medical imaging performance remains unstudied. In response, we introduce FedMedICL, a unified framework and benchmark to holistically evaluate federated medical imaging challenges, simultaneously capturing label, demographic, and temporal distribution shifts. 
We comprehensively evaluate several popular methods on six diverse medical imaging datasets (totaling 550 GPU hours). Furthermore, we use FedMedICL to simulate COVID-19 propagation across hospitals and evaluate whether methods can adapt to pandemic changes in disease prevalence. We find that a simple batch balancing technique surpasses advanced methods in average performance across FedMedICL experiments. This finding questions the applicability of results from previous, narrow benchmarks in real-world medical settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08822v1-abstract-full').style.display = 'none'; document.getElementById('2407.08822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI 2024. Code is available at: https://github.com/m1k2zoo/FedMedICL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08023">arXiv:2407.08023</a> <span> [<a href="https://arxiv.org/pdf/2407.08023">pdf</a>, <a href="https://arxiv.org/format/2407.08023">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Structure-from-Motion and Camera Relocalization for Enhanced Egocentric Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mai%2C+J">Jinjie Mai</a>, <a href="/search/cs?searchtype=author&query=Hamdi%2C+A">Abdullah Hamdi</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08023v1-abstract-short" style="display: inline;"> We built our pipeline EgoLoc-v1, mainly inspired by EgoLoc. We propose a model ensemble strategy to improve the camera pose estimation part of the VQ3D task, which has been proven to be essential in previous work. The core idea is not only to do SfM for egocentric videos but also to do 2D-3D matching between existing 3D scans and 2D video frames. In this way, we have a hybrid SfM and camera reloca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08023v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08023v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08023v1-abstract-full" style="display: none;"> We built our pipeline EgoLoc-v1, mainly inspired by EgoLoc. We propose a model ensemble strategy to improve the camera pose estimation part of the VQ3D task, which has been proven to be essential in previous work. 
The core idea is not only to do SfM for egocentric videos but also to do 2D-3D matching between existing 3D scans and 2D video frames. In this way, we have a hybrid SfM and camera relocalization pipeline, which can provide us with more camera poses, leading to higher QwP and overall success rate. Our method achieves the best performance regarding the most important metric, the overall success rate. We surpass previous state-of-the-art, the competitive EgoLoc, by $1.5\%$. The code is available at \url{https://github.com/Wayne-Mai/egoloc_v1}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08023v1-abstract-full').style.display = 'none'; document.getElementById('2407.08023v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1st place winner of the 2024 Ego4D-Ego-Exo4D Challenge in VQ3D</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06136">arXiv:2407.06136</a> <span> [<a href="https://arxiv.org/pdf/2407.06136">pdf</a>, <a href="https://arxiv.org/format/2407.06136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mamba-FSCIL: Dynamic Adaptation with Selective State Space Model for Few-Shot Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianlong Wu</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+L">Liqiang Nie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06136v2-abstract-short" style="display: inline;"> Few-shot class-incremental learning (FSCIL) confronts the challenge of integrating new classes into a model with minimal training samples while preserving the knowledge of previously learned classes. Traditional methods widely adopt static adaptation relying on a fixed parameter space to learn from data that arrive sequentially, prone to overfitting to the current session. Existing dynamic strateg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06136v2-abstract-full').style.display = 'inline'; document.getElementById('2407.06136v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06136v2-abstract-full" style="display: none;"> Few-shot class-incremental learning (FSCIL) confronts the challenge of integrating new classes into a model with minimal training samples while preserving the knowledge of previously learned classes. 
Traditional methods widely adopt static adaptation relying on a fixed parameter space to learn from data that arrive sequentially, prone to overfitting to the current session. Existing dynamic strategies require the expansion of the parameter space continually, leading to increased complexity. In this study, we explore the potential of Selective State Space Models (SSMs) for FSCIL, leveraging its dynamic weights and strong ability in sequence modeling to address these challenges. Concretely, we propose a dual selective SSM projector that dynamically adjusts the projection parameters based on the intermediate features for dynamic adaptation. The dual design enables the model to maintain the robust features of base classes, while adaptively learning distinctive feature shifts for novel classes. Additionally, we develop a class-sensitive selective scan mechanism to guide dynamic adaptation. It minimizes the disruption to base-class representations caused by training on novel data, and meanwhile, forces the selective scan to perform in distinct patterns between base and novel classes. Experiments on miniImageNet, CUB-200, and CIFAR-100 demonstrate that our framework outperforms the existing state-of-the-art methods. The code is available at \url{https://github.com/xiaojieli0903/Mamba-FSCIL}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06136v2-abstract-full').style.display = 'none'; document.getElementById('2407.06136v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/xiaojieli0903/Mamba-FSCIL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02370">arXiv:2407.02370</a> <span> [<a href="https://arxiv.org/pdf/2407.02370">pdf</a>, <a href="https://arxiv.org/format/2407.02370">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Investigating Event-Based Cameras for Video Frame Interpolation in Sports </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deckyvere%2C+A">Antoine Deckyvere</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02370v2-abstract-short" style="display: inline;"> Slow-motion replays provide a thrilling perspective on pivotal moments within sports games, offering a fresh and captivating visual experience. 
However, capturing slow-motion footage typically demands high-tech, expensive cameras and infrastructures. Deep learning Video Frame Interpolation (VFI) techniques have emerged as a promising avenue, capable of generating high-speed footage from regular ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02370v2-abstract-full').style.display = 'inline'; document.getElementById('2407.02370v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02370v2-abstract-full" style="display: none;"> Slow-motion replays provide a thrilling perspective on pivotal moments within sports games, offering a fresh and captivating visual experience. However, capturing slow-motion footage typically demands high-tech, expensive cameras and infrastructures. Deep learning Video Frame Interpolation (VFI) techniques have emerged as a promising avenue, capable of generating high-speed footage from regular camera feeds. Moreover, the utilization of event-based cameras has recently gathered attention as they provide valuable motion information between frames, further enhancing the VFI performances. In this work, we present a first investigation of event-based VFI models for generating sports slow-motion videos. Particularly, we design and implement a bi-camera recording setup, including an RGB and an event-based camera to capture sports videos, to temporally align and spatially register both cameras. Our experimental validation demonstrates that TimeLens, an off-the-shelf event-based VFI model, can effectively generate slow-motion footage for sports videos. This first investigation underscores the practical utility of event-based cameras in producing sports slow-motion content and lays the groundwork for future research endeavors in this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02370v2-abstract-full').style.display = 'none'; document.getElementById('2407.02370v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01511">arXiv:2407.01511</a> <span> [<a href="https://arxiv.org/pdf/2407.01511">pdf</a>, <a href="https://arxiv.org/format/2407.01511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CRAB: Cross-environment Agent Benchmark for Multimodal Language Model Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianqi Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Linyao Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Dai-Jie Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zecheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+X">Xiang Yao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zhiqiang Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongchao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilong Liu</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+B">Bochen Qian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">Anjie Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhaoxuan Jin</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jianbo Deng</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01511v2-abstract-short" style="display: inline;"> The development of autonomous agents increasingly relies on Multimodal Language Models (MLMs) to perform tasks described in natural language with GUI environments, such as websites, desktop computers, or mobile phones. Existing benchmarks for MLM agents in interactive environments are limited by their focus on a single environment, lack of detailed and generalized evaluation methods, and the compl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01511v2-abstract-full').style.display = 'inline'; document.getElementById('2407.01511v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01511v2-abstract-full" style="display: none;"> The development of autonomous agents increasingly relies on Multimodal Language Models (MLMs) to perform tasks described in natural language with GUI environments, such as websites, desktop computers, or mobile phones. Existing benchmarks for MLM agents in interactive environments are limited by their focus on a single environment, lack of detailed and generalized evaluation methods, and the complexities of constructing tasks and evaluators. To overcome these limitations, we introduce Crab, the first agent benchmark framework designed to support cross-environment tasks, incorporating a graph-based fine-grained evaluation method and an efficient mechanism for task and evaluator construction. 
Our framework supports multiple devices and can be easily extended to any environment with a Python interface. Leveraging Crab, we developed a cross-platform Crab Benchmark-v0 comprising 120 tasks in computer desktop and mobile phone environments. We evaluated four advanced MLMs using different single and multi-agent system configurations on this benchmark. The experimental results demonstrate that the single agent with GPT-4o achieves the best completion ratio of 38.01%. All framework code, agent code, and task datasets are publicly available at https://github.com/camel-ai/crab. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01511v2-abstract-full').style.display = 'none'; document.getElementById('2407.01511v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01265">arXiv:2407.01265</a> <span> [<a href="https://arxiv.org/pdf/2407.01265">pdf</a>, <a href="https://arxiv.org/format/2407.01265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OSL-ActionSpotting: A Unified Library for Action Spotting in Sports Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Benzakour%2C+Y">Yassine Benzakour</a>, <a href="/search/cs?searchtype=author&query=Cabado%2C+B">Bruno Cabado</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01265v1-abstract-short" style="display: inline;"> Action spotting is crucial in sports analytics as it enables the precise identification and categorization of pivotal moments in sports matches, providing insights that are essential for performance analysis and tactical decision-making. The fragmentation of existing methodologies, however, impedes the progression of sports analytics, necessitating a unified codebase to support the development and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01265v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01265v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01265v1-abstract-full" style="display: none;"> Action spotting is crucial in sports analytics as it enables the precise identification and categorization of pivotal moments in sports matches, providing insights that are essential for performance analysis and tactical decision-making. 
The fragmentation of existing methodologies, however, impedes the progression of sports analytics, necessitating a unified codebase to support the development and deployment of action spotting for video analysis. In this work, we introduce OSL-ActionSpotting, a Python library that unifies different action spotting algorithms to streamline research and applications in sports video analytics. OSL-ActionSpotting encapsulates various state-of-the-art techniques into a singular, user-friendly framework, offering standardized processes for action spotting and analysis across multiple datasets. We successfully integrated three cornerstone action spotting methods into OSL-ActionSpotting, achieving performance metrics that match those of the original, disparate codebases. This unification within a single library preserves the effectiveness of each method and enhances usability and accessibility for researchers and practitioners in sports analytics. By bridging the gaps between various action spotting techniques, OSL-ActionSpotting significantly contributes to the field of sports video analysis, fostering enhanced analytical capabilities and collaborative research opportunities. The scalable and modularized design of the library ensures its long-term relevance and adaptability to future technological advancements in the domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01265v1-abstract-full').style.display = 'none'; document.getElementById('2407.01265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14563">arXiv:2406.14563</a> <span> [<a href="https://arxiv.org/pdf/2406.14563">pdf</a>, <a href="https://arxiv.org/format/2406.14563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model Merging and Safety Alignment: One Bad Model Spoils the Bunch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hammoud%2C+H+A+A+K">Hasan Abed Al Kader Hammoud</a>, <a href="/search/cs?searchtype=author&query=Michieli%2C+U">Umberto Michieli</a>, <a href="/search/cs?searchtype=author&query=Pizzati%2C+F">Fabio Pizzati</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Ozay%2C+M">Mete Ozay</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14563v1-abstract-short" style="display: inline;"> Merging Large Language Models (LLMs) is a cost-effective technique for combining multiple expert LLMs into a single versatile model, retaining the expertise of the original ones. However, current approaches often overlook the importance of safety alignment during merging, leading to highly misaligned models. This work investigates the effects of model merging on alignment. We evaluate several popu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14563v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14563v1-abstract-full" style="display: none;"> Merging Large Language Models (LLMs) is a cost-effective technique for combining multiple expert LLMs into a single versatile model, retaining the expertise of the original ones. However, current approaches often overlook the importance of safety alignment during merging, leading to highly misaligned models. This work investigates the effects of model merging on alignment. We evaluate several popular model merging techniques, demonstrating that existing methods do not only transfer domain expertise but also propagate misalignment. We propose a simple two-step approach to address this problem: (i) generating synthetic safety and domain-specific data, and (ii) incorporating these generated data into the optimization process of existing data-aware model merging techniques. This allows us to treat alignment as a skill that can be maximized in the resulting merged LLM. Our experiments illustrate the effectiveness of integrating alignment-related data during merging, resulting in models that excel in both domain expertise and alignment. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14563v1-abstract-full').style.display = 'none'; document.getElementById('2406.14563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08659">arXiv:2406.08659</a> <span> [<a href="https://arxiv.org/pdf/2406.08659">pdf</a>, <a href="https://arxiv.org/format/2406.08659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Vivid-ZOO: Multi-View Video Generation with Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Cheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenxuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Mai%2C+J">Jinjie Mai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Biao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wonka%2C+P">Peter Wonka</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08659v1-abstract-short" style="display: inline;"> While diffusion models have shown impressive performance in 2D image/video generation, diffusion-based Text-to-Multi-view-Video (T2MVid) generation remains underexplored. The new challenges posed by T2MVid generation lie in the lack of massive captioned multi-view videos and the complexity of modeling such multi-dimensional distribution. To this end, we propose a novel diffusion-based pipeline tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08659v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08659v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08659v1-abstract-full" style="display: none;"> While diffusion models have shown impressive performance in 2D image/video generation, diffusion-based Text-to-Multi-view-Video (T2MVid) generation remains underexplored. The new challenges posed by T2MVid generation lie in the lack of massive captioned multi-view videos and the complexity of modeling such multi-dimensional distribution. To this end, we propose a novel diffusion-based pipeline that generates high-quality multi-view videos centered around a dynamic 3D object from text. Specifically, we factor the T2MVid problem into viewpoint-space and time components. 
Such factorization allows us to combine and reuse layers of advanced pre-trained multi-view image and 2D video diffusion models to ensure multi-view consistency as well as temporal coherence for the generated multi-view videos, largely reducing the training cost. We further introduce alignment modules to align the latent spaces of layers from the pre-trained multi-view and the 2D video diffusion models, addressing the reused layers' incompatibility that arises from the domain gap between 2D and multi-view data. In support of this and future research, we further contribute a captioned multi-view video dataset. Experimental results demonstrate that our method generates high-quality multi-view videos, exhibiting vivid motions, temporal coherence, and multi-view consistency, given a variety of text prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08659v1-abstract-full').style.display = 'none'; document.getElementById('2406.08659v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our project page is at https://hi-zhengcheng.github.io/vividzoo/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05223">arXiv:2406.05223</a> <span> [<a href="https://arxiv.org/pdf/2406.05223">pdf</a>, <a href="https://arxiv.org/format/2406.05223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhongzhu Zhou</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S+L">Shuaiwen Leon Song</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianlong Wu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+L">Liqiang Nie</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05223v2-abstract-short" style="display: inline;"> Current parameter-efficient fine-tuning (PEFT) methods build adapters widely agnostic of the context of downstream task to learn, or the context of important knowledge to maintain. As a result, there is often a performance gap compared to full-parameter fine-tuning, and meanwhile the fine-tuned model suffers from catastrophic forgetting of the pre-trained world knowledge. 
In this paper, we propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05223v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05223v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05223v2-abstract-full" style="display: none;"> Current parameter-efficient fine-tuning (PEFT) methods build adapters widely agnostic of the context of downstream task to learn, or the context of important knowledge to maintain. As a result, there is often a performance gap compared to full-parameter fine-tuning, and meanwhile the fine-tuned model suffers from catastrophic forgetting of the pre-trained world knowledge. In this paper, we propose CorDA, a Context-oriented Decomposition Adaptation method that builds learnable task-aware adapters from weight decomposition oriented by the context of downstream task or the world knowledge to maintain. Concretely, we collect a few data samples, and perform singular value decomposition for each linear layer of a pre-trained LLM multiplied by the covariance matrix of the input activation using these samples. The inverse of the covariance matrix is multiplied with the decomposed components to reconstruct the original weights. By doing so, the context of the representative samples is captured through deciding the factorizing orientation. Our method enables two options, the knowledge-preserved adaptation and the instruction-previewed adaptation. For the former, we use question-answering samples to obtain the covariance matrices, and use the decomposed components with the smallest $r$ singular values to initialize a learnable adapter, with the others frozen such that the world knowledge is better preserved. For the latter, we use the instruction data from the fine-tuning task, such as math or coding, to orientate the decomposition and train the largest $r$ components that most correspond to the task to learn. We conduct extensive experiments on Math, Code, and Instruction Following tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05223v2-abstract-full').style.display = 'none'; document.getElementById('2406.05223v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05222">arXiv:2406.05222</a> <span> [<a href="https://arxiv.org/pdf/2406.05222">pdf</a>, <a href="https://arxiv.org/format/2406.05222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Towards Interpretable Deep Local Learning with Successive Gradient Reconciliation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/cs?searchtype=author&query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&query=Hammoud%2C+H">Hasan Hammoud</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05222v1-abstract-short" style="display: inline;"> Relieving the reliance of neural network training on a global back-propagation (BP) has emerged as a notable research topic due to the biological implausibility and huge memory consumption caused by BP. Among the existing solutions, local learning optimizes gradient-isolated modules of a neural network with local errors and has been proved to be effective even on large-scale datasets. However, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05222v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05222v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05222v1-abstract-full" style="display: none;"> Relieving the reliance of neural network training on a global back-propagation (BP) has emerged as a notable research topic due to the biological implausibility and huge memory consumption caused by BP. Among the existing solutions, local learning optimizes gradient-isolated modules of a neural network with local errors and has been proved to be effective even on large-scale datasets. However, the reconciliation among local errors has never been investigated. In this paper, we first theoretically study non-greedy layer-wise training and show that the convergence cannot be assured when the local gradient in a module w.r.t. its input is not reconciled with the local gradient in the previous module w.r.t. its output. Inspired by the theoretical result, we further propose a local training strategy that successively regularizes the gradient reconciliation between neighboring modules without breaking gradient isolation or introducing any learnable parameters. Our method can be integrated into both local-BP and BP-free settings. In experiments, we achieve significant performance improvements compared to previous methods. 
Particularly, our method for CNN and Transformer architectures on ImageNet is able to attain a competitive performance with global BP, saving more than 40% memory consumption. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05222v1-abstract-full').style.display = 'none'; document.getElementById('2406.05222v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17146">arXiv:2405.17146</a> <span> [<a href="https://arxiv.org/pdf/2405.17146">pdf</a>, <a href="https://arxiv.org/format/2405.17146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Compressed-Language Models for Understanding Compressed File Formats: a JPEG Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=P%C3%A9rez%2C+J+C">Juan C. Pérez</a>, <a href="/search/cs?searchtype=author&query=Pardo%2C+A">Alejandro Pardo</a>, <a href="/search/cs?searchtype=author&query=Soldan%2C+M">Mattia Soldan</a>, <a href="/search/cs?searchtype=author&query=Itani%2C+H">Hani Itani</a>, <a href="/search/cs?searchtype=author&query=Leon-Alcazar%2C+J">Juan Leon-Alcazar</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17146v1-abstract-short" style="display: inline;"> This study investigates whether Compressed-Language Models (CLMs), i.e. language models operating on raw byte streams from Compressed File Formats (CFFs), can understand files compressed by CFFs. We focus on the JPEG format as a representative CFF, given its commonality and its representativeness of key concepts in compression, such as entropy coding and run-length encoding. We test if CLMs unders… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17146v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17146v1-abstract-full" style="display: none;"> This study investigates whether Compressed-Language Models (CLMs), i.e. language models operating on raw byte streams from Compressed File Formats (CFFs), can understand files compressed by CFFs. We focus on the JPEG format as a representative CFF, given its commonality and its representativeness of key concepts in compression, such as entropy coding and run-length encoding. We test if CLMs understand the JPEG format by probing their capabilities to perform along three axes: recognition of inherent file properties, handling of files with anomalies, and generation of new files. Our findings demonstrate that CLMs can effectively perform these tasks. 
These results suggest that CLMs can understand the semantics of compressed data when directly operating on the byte streams of files produced by CFFs. Directly operating on raw compressed files offers the promise of leveraging some of their remarkable characteristics, such as their ubiquity, compactness, multi-modality and segment-nature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17146v1-abstract-full').style.display = 'none'; document.getElementById('2405.17146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00466">arXiv:2405.00466</a> <span> [<a href="https://arxiv.org/pdf/2405.00466">pdf</a>, <a href="https://arxiv.org/format/2405.00466">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Lazy Layers to Make Fine-Tuned Diffusion Models More Traceable </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haozhe Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentian Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Schmidhuber%2C+J">Jürgen Schmidhuber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00466v1-abstract-short" style="display: inline;"> Foundational generative models should be traceable to protect their owners and facilitate safety regulation. To achieve this, traditional approaches embed identifiers based on supervisory trigger-response signals, which are commonly known as backdoor watermarks. They are prone to failure when the model is fine-tuned with nontrigger data. Our experiments show that this vulnerability is due to energ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00466v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00466v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00466v1-abstract-full" style="display: none;"> Foundational generative models should be traceable to protect their owners and facilitate safety regulation. To achieve this, traditional approaches embed identifiers based on supervisory trigger-response signals, which are commonly known as backdoor watermarks. They are prone to failure when the model is fine-tuned with nontrigger data. Our experiments show that this vulnerability is due to energetic changes in only a few 'busy' layers during fine-tuning. This yields a novel arbitrary-in-arbitrary-out (AIAO) strategy that makes watermarks resilient to fine-tuning-based removal. 
The trigger-response pairs of AIAO samples across various neural network depths can be used to construct watermarked subpaths, employing Monte Carlo sampling to achieve stable verification results. In addition, unlike the existing methods of designing a backdoor for the input/output space of diffusion models, in our method, we propose to embed the backdoor into the feature space of sampled subpaths, where a mask-controlled trigger function is proposed to preserve the generation performance and ensure the invisibility of the embedded backdoor. Our empirical studies on the MS-COCO, AFHQ, LSUN, CUB-200, and DreamBooth datasets confirm the robustness of AIAO; while the verification rates of other trigger-based methods fall from ~90% to ~70% after fine-tuning, those of our method remain consistently above 90%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00466v1-abstract-full').style.display = 'none'; document.getElementById('2405.00466v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.17930">arXiv:2404.17930</a> <span> [<a href="https://arxiv.org/pdf/2404.17930">pdf</a>, <a href="https://arxiv.org/format/2404.17930">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Stream Cellular Test-Time Adaptation of Real-Time Models Evolving in Dynamic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=G%C3%A9rin%2C+B">Benoît Gérin</a>, <a href="/search/cs?searchtype=author&query=Halin%2C+A">Anaïs Halin</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Henry%2C+M">Maxim Henry</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Macq%2C+B">Benoît Macq</a>, <a href="/search/cs?searchtype=author&query=De+Vleeschouwer%2C+C">Christophe De Vleeschouwer</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.17930v1-abstract-short" style="display: inline;"> In the era of the Internet of Things (IoT), objects connect through a dynamic network, empowered by technologies like 5G, enabling real-time data sharing. However, smart objects, notably autonomous vehicles, face challenges in critical local computations due to limited resources. Lightweight AI models offer a solution but struggle with diverse data distributions. 
To address this limitation, we pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17930v1-abstract-full').style.display = 'inline'; document.getElementById('2404.17930v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.17930v1-abstract-full" style="display: none;"> In the era of the Internet of Things (IoT), objects connect through a dynamic network, empowered by technologies like 5G, enabling real-time data sharing. However, smart objects, notably autonomous vehicles, face challenges in critical local computations due to limited resources. Lightweight AI models offer a solution but struggle with diverse data distributions. To address this limitation, we propose a novel Multi-Stream Cellular Test-Time Adaptation (MSC-TTA) setup where models adapt on the fly to a dynamic environment divided into cells. Then, we propose a real-time adaptive student-teacher method that leverages the multiple streams available in each cell to quickly adapt to changing data distributions. We validate our methodology in the context of autonomous vehicles navigating across cells defined based on location and weather conditions. To facilitate future benchmarking, we release a new multi-stream large-scale synthetic semantic segmentation dataset, called DADE, and show that our multi-stream approach outperforms a single-stream baseline. We believe that our work will open research opportunities in the IoT and 5G eras, offering solutions for real-time model adaptation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17930v1-abstract-full').style.display = 'none'; document.getElementById('2404.17930v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15161">arXiv:2404.15161</a> <span> [<a href="https://arxiv.org/pdf/2404.15161">pdf</a>, <a href="https://arxiv.org/format/2404.15161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Combating Missing Modalities in Egocentric Videos at Test Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ramazanova%2C+M">Merey Ramazanova</a>, <a href="/search/cs?searchtype=author&query=Pardo%2C+A">Alejandro Pardo</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Alfarra%2C+M">Motasem Alfarra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15161v1-abstract-short" style="display: inline;"> Understanding videos that contain multiple modalities is crucial, especially in egocentric videos, where combining various sensory inputs significantly improves tasks like action recognition and moment localization. 
However, real-world applications often face challenges with incomplete modalities due to privacy concerns, efficiency needs, or hardware issues. Current methods, while effective, often… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15161v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15161v1-abstract-full" style="display: none;"> Understanding videos that contain multiple modalities is crucial, especially in egocentric videos, where combining various sensory inputs significantly improves tasks like action recognition and moment localization. However, real-world applications often face challenges with incomplete modalities due to privacy concerns, efficiency needs, or hardware issues. Current methods, while effective, often necessitate retraining the model entirely to handle missing modalities, making them computationally intensive, particularly with large training datasets. In this study, we propose a novel approach to address this issue at test time without requiring retraining. We frame the problem as a test-time adaptation task, where the model adjusts to the available unlabeled data at test time. Our method, MiDl~(Mutual information with self-Distillation), encourages the model to be insensitive to the specific modality source present during testing by minimizing the mutual information between the prediction and the available modality. Additionally, we incorporate self-distillation to maintain the model's original performance when both modalities are available. MiDl represents the first self-supervised, online solution for handling missing modalities exclusively at test time. Through experiments with various pretrained models and datasets, MiDl demonstrates substantial performance improvement without the need for retraining. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15161v1-abstract-full').style.display = 'none'; document.getElementById('2404.15161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12766">arXiv:2404.12766</a> <span> [<a href="https://arxiv.org/pdf/2404.12766">pdf</a>, <a href="https://arxiv.org/format/2404.12766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Continual Learning on a Diet: Learning from Sparsely Labeled Streams Under Constrained Computation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+Y">Youssef Mohamed</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P+H+S">Philip H. S. 
Torr</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Elhoseiny%2C+M">Mohamed Elhoseiny</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12766v2-abstract-short" style="display: inline;"> We propose and study a realistic Continual Learning (CL) setting where learning algorithms are granted a restricted computational budget per time step while training. We apply this setting to large-scale semi-supervised Continual Learning scenarios with sparse label rates. Previous proficient CL methods perform very poorly in this challenging setting. Overfitting to the sparse labeled data and ins… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12766v2-abstract-full').style.display = 'inline'; document.getElementById('2404.12766v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12766v2-abstract-full" style="display: none;"> We propose and study a realistic Continual Learning (CL) setting where learning algorithms are granted a restricted computational budget per time step while training. We apply this setting to large-scale semi-supervised Continual Learning scenarios with sparse label rates. Previous proficient CL methods perform very poorly in this challenging setting. Overfitting to the sparse labeled data and insufficient computational budget are the two main culprits for such a poor performance. Our new setting encourages learning methods to effectively and efficiently utilize the unlabeled data during training. To that end, we propose a simple but highly effective baseline, DietCL, which utilizes both unlabeled and labeled data jointly. DietCL meticulously allocates computational budget for both types of data. We validate our baseline, at scale, on several datasets, e.g., CLOC, ImageNet10K, and CGLM, under constraint budget setups. DietCL outperforms, by a large margin, all existing supervised CL algorithms as well as more recent continual semi-supervised methods. Our extensive analysis and ablations demonstrate that DietCL is stable under a full spectrum of label sparsity, computational budget, and various other ablations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12766v2-abstract-full').style.display = 'none'; document.getElementById('2404.12766v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
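<p class="is-size-7">As a rough illustration of the budget-constrained training described in the DietCL abstract above, the sketch below splits a fixed number of update steps per time step between labeled and unlabeled batches. The step functions and the fixed allocation ratio are illustrative assumptions, not the paper's actual allocation strategy.</p>
<pre><code>
# Hypothetical sketch of a per-time-step compute budget split between labeled
# and unlabeled updates; the ratio and step functions are placeholders and do
# not reproduce DietCL's allocation rule.

def labeled_step(batch):
    """Placeholder for one supervised optimizer step on a labeled batch."""
    pass

def unlabeled_step(batch):
    """Placeholder for one unsupervised (e.g. pseudo-label) optimizer step."""
    pass

def run_time_step(labeled_batches, unlabeled_batches,
                  budget_steps=100, labeled_fraction=0.25):
    """Spend a fixed per-time-step compute budget across both data types."""
    labeled_budget = int(budget_steps * labeled_fraction)
    unlabeled_budget = budget_steps - labeled_budget
    for i in range(labeled_budget):
        labeled_step(labeled_batches[i % len(labeled_batches)])
    for i in range(unlabeled_budget):
        unlabeled_step(unlabeled_batches[i % len(unlabeled_batches)])
</code></pre>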
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.11335">arXiv:2404.11335</a> <span> [<a href="https://arxiv.org/pdf/2404.11335">pdf</a>, <a href="https://arxiv.org/format/2404.11335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SoccerNet Game State Reconstruction: End-to-End Athlete Tracking and Identification on a Minimap </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Somers%2C+V">Vladimir Somers</a>, <a href="/search/cs?searchtype=author&query=Joos%2C+V">Victor Joos</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Ghasemzadeh%2C+S+A">Seyed Abolfazl Ghasemzadeh</a>, <a href="/search/cs?searchtype=author&query=Magera%2C+F">Floriane Magera</a>, <a href="/search/cs?searchtype=author&query=Standaert%2C+B">Baptiste Standaert</a>, <a href="/search/cs?searchtype=author&query=Mansourian%2C+A+M">Amir Mohammad Mansourian</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&query=Kasaei%2C+S">Shohreh Kasaei</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Alahi%2C+A">Alexandre Alahi</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a>, <a href="/search/cs?searchtype=author&query=De+Vleeschouwer%2C+C">Christophe De Vleeschouwer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.11335v1-abstract-short" style="display: inline;"> Tracking and identifying athletes on the pitch holds a central role in collecting essential insights from the game, such as estimating the total distance covered by players or understanding team tactics. This tracking and identification process is crucial for reconstructing the game state, defined by the athletes' positions and identities on a 2D top-view of the pitch, (i.e. a minimap). However, r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11335v1-abstract-full').style.display = 'inline'; document.getElementById('2404.11335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.11335v1-abstract-full" style="display: none;"> Tracking and identifying athletes on the pitch holds a central role in collecting essential insights from the game, such as estimating the total distance covered by players or understanding team tactics. This tracking and identification process is crucial for reconstructing the game state, defined by the athletes' positions and identities on a 2D top-view of the pitch, (i.e. a minimap). However, reconstructing the game state from videos captured by a single camera is challenging. 
It requires understanding the position of the athletes and the viewpoint of the camera to localize and identify players within the field. In this work, we formalize the task of Game State Reconstruction and introduce SoccerNet-GSR, a novel Game State Reconstruction dataset focusing on football videos. SoccerNet-GSR is composed of 200 video sequences of 30 seconds, annotated with 9.37 million line points for pitch localization and camera calibration, as well as over 2.36 million athlete positions on the pitch with their respective role, team, and jersey number. Furthermore, we introduce GS-HOTA, a novel metric to evaluate game state reconstruction methods. Finally, we propose and release an end-to-end baseline for game state reconstruction, bootstrapping the research on this task. Our experiments show that GSR is a challenging novel task, which opens the field for future research. Our dataset and codebase are publicly available at https://github.com/SoccerNet/sn-gamestate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11335v1-abstract-full').style.display = 'none'; document.getElementById('2404.11335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2024 IEEE/CVF Conf. Comput. Vis. Pattern Recognit. Work. (CVPRW) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06332">arXiv:2404.06332</a> <span> [<a href="https://arxiv.org/pdf/2404.06332">pdf</a>, <a href="https://arxiv.org/format/2404.06332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-VARS: Introducing Explainability in Football Refereeing with Multi-Modal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Held%2C+J">Jan Held</a>, <a href="/search/cs?searchtype=author&query=Itani%2C+H">Hani Itani</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06332v1-abstract-short" style="display: inline;"> The rapid advancement of artificial intelligence has led to significant improvements in automated decision-making. However, the increased performance of models often comes at the cost of explainability and transparency of their decision-making processes. 
In this paper, we investigate the capabilities of large language models to explain decisions, using football refereeing as a testing ground, give… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06332v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06332v1-abstract-full" style="display: none;"> The rapid advancement of artificial intelligence has led to significant improvements in automated decision-making. However, the increased performance of models often comes at the cost of explainability and transparency of their decision-making processes. In this paper, we investigate the capabilities of large language models to explain decisions, using football refereeing as a testing ground, given its decision complexity and subjectivity. We introduce the Explainable Video Assistant Referee System, X-VARS, a multi-modal large language model designed for understanding football videos from the point of view of a referee. X-VARS can perform a multitude of tasks, including video description, question answering, action recognition, and conducting meaningful conversations based on video content and in accordance with the Laws of the Game for football referees. We validate X-VARS on our novel dataset, SoccerNet-XFoul, which consists of more than 22k video-question-answer triplets annotated by over 70 experienced football referees. Our experiments and human study illustrate the impressive capabilities of X-VARS in interpreting complex football clips. Furthermore, we highlight the potential of X-VARS to reach human performance and support football referees in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06332v1-abstract-full').style.display = 'none'; document.getElementById('2404.06332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
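<p class="is-size-7">The SoccerNet-XFoul data described in the X-VARS abstract above consists of video-question-answer triplets annotated by referees. The record layout sketched below is purely illustrative; the field names and example values are assumptions, not the released schema.</p>
<pre><code>
# Illustrative layout for one video-question-answer annotation; the fields
# and values are hypothetical and do not reflect the released dataset format.
from dataclasses import dataclass

@dataclass
class FoulQATriplet:
    clip_path: str   # path to the football video clip
    question: str    # referee-oriented question about the clip
    answer: str      # free-form answer written by an experienced referee

example = FoulQATriplet(
    clip_path="clips/match_0001_foul_17.mp4",
    question="Should the contact in this clip be sanctioned, and how?",
    answer="Careless challenge; direct free kick, no card.",
)
</code></pre>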
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04526">arXiv:2404.04526</a> <span> [<a href="https://arxiv.org/pdf/2404.04526">pdf</a>, <a href="https://arxiv.org/format/2404.04526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DATENeRF: Depth-Aware Text-based Editing of NeRFs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rojas%2C+S">Sara Rojas</a>, <a href="/search/cs?searchtype=author&query=Philip%2C+J">Julien Philip</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Sunkavall%2C+K">Kalyan Sunkavall</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04526v2-abstract-short" style="display: inline;"> Recent advancements in diffusion models have shown remarkable proficiency in editing 2D images based on text prompts. However, extending these techniques to edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual 2D frames can result in inconsistencies across multiple views. Our crucial insight is that a NeRF scene's geometry can serve as a bridge to integrate these 2D edits… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04526v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04526v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04526v2-abstract-full" style="display: none;"> Recent advancements in diffusion models have shown remarkable proficiency in editing 2D images based on text prompts. However, extending these techniques to edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual 2D frames can result in inconsistencies across multiple views. Our crucial insight is that a NeRF scene's geometry can serve as a bridge to integrate these 2D edits. Utilizing this geometry, we employ a depth-conditioned ControlNet to enhance the coherence of each 2D image modification. Moreover, we introduce an inpainting approach that leverages the depth information of NeRF scenes to distribute 2D edits across different images, ensuring robustness against errors and resampling challenges. Our results reveal that this methodology achieves more consistent, lifelike, and detailed edits than existing leading methods for text-driven NeRF scene editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04526v2-abstract-full').style.display = 'none'; document.getElementById('2404.04526v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3D Scene Editing, Neural Rendering, Diffusion Models, Accepted to ECCV24</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ECCV 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03477">arXiv:2404.03477</a> <span> [<a href="https://arxiv.org/pdf/2404.03477">pdf</a>, <a href="https://arxiv.org/format/2404.03477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Automated Movie Trailer Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Argaw%2C+D+M">Dawit Mureja Argaw</a>, <a href="/search/cs?searchtype=author&query=Soldan%2C+M">Mattia Soldan</a>, <a href="/search/cs?searchtype=author&query=Pardo%2C+A">Alejandro Pardo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Heilbron%2C+F+C">Fabian Caba Heilbron</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03477v1-abstract-short" style="display: inline;"> Movie trailers are an essential tool for promoting films and attracting audiences. However, the process of creating trailers can be time-consuming and expensive. To streamline this process, we propose an automatic trailer generation framework that generates plausible trailers from a full movie by automating shot selection and composition. Our approach draws inspiration from machine translation tec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03477v1-abstract-full').style.display = 'inline'; document.getElementById('2404.03477v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03477v1-abstract-full" style="display: none;"> Movie trailers are an essential tool for promoting films and attracting audiences. However, the process of creating trailers can be time-consuming and expensive. To streamline this process, we propose an automatic trailer generation framework that generates plausible trailers from a full movie by automating shot selection and composition. Our approach draws inspiration from machine translation techniques and models the movies and trailers as sequences of shots, thus formulating the trailer generation problem as a sequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a deep-learning framework utilizing an encoder-decoder architecture. TGT movie encoder is tasked with contextualizing each movie shot representation via self-attention, while the autoregressive trailer decoder predicts the feature representation of the next trailer shot, accounting for the relevance of shots' temporal order in trailers. Our TGT significantly outperforms previous methods on a comprehensive suite of metrics. 
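<p class="is-size-7">As a rough sketch of the sequence-to-sequence formulation described in the abstract above, the snippet below encodes precomputed movie-shot features and autoregressively regresses the next trailer-shot feature. The dimensions, layer counts, and loss are illustrative assumptions, not TGT's actual configuration.</p>
<pre><code>
# Minimal encoder-decoder sketch for next-shot feature prediction; the
# hyperparameters and loss are placeholders, not the paper's settings.
import torch
import torch.nn as nn

d_model, num_trailer_shots = 512, 12
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=d_model, nhead=8, batch_first=True),
    num_layers=4,
)
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model=d_model, nhead=8, batch_first=True),
    num_layers=4,
)

movie_shots = torch.randn(1, 200, d_model)                  # precomputed movie-shot features
trailer_shots = torch.randn(1, num_trailer_shots, d_model)  # precomputed trailer-shot features

memory = encoder(movie_shots)                               # contextualize movie shots
causal_mask = torch.triu(                                   # autoregressive (causal) mask
    torch.full((num_trailer_shots, num_trailer_shots), float("-inf")), diagonal=1
)
pred = decoder(trailer_shots, memory, tgt_mask=causal_mask)
loss = nn.functional.mse_loss(pred[:, :-1], trailer_shots[:, 1:])  # next-shot regression
</code></pre>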
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03477v1-abstract-full').style.display = 'none'; document.getElementById('2404.03477v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00777">arXiv:2404.00777</a> <span> [<a href="https://arxiv.org/pdf/2404.00777">pdf</a>, <a href="https://arxiv.org/format/2404.00777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Privacy-preserving Optics for Enhancing Protection in Face De-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lopez%2C+J">Jhon Lopez</a>, <a href="/search/cs?searchtype=author&query=Hinojosa%2C+C">Carlos Hinojosa</a>, <a href="/search/cs?searchtype=author&query=Arguello%2C+H">Henry Arguello</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00777v1-abstract-short" style="display: inline;"> The modern surge in camera usage alongside widespread computer vision technology applications poses significant privacy and security concerns. Current artificial intelligence (AI) technologies aid in recognizing relevant events and assisting in daily tasks in homes, offices, hospitals, etc. The need to access or process personal information for these purposes raises privacy concerns. While softwar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00777v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00777v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00777v1-abstract-full" style="display: none;"> The modern surge in camera usage alongside widespread computer vision technology applications poses significant privacy and security concerns. Current artificial intelligence (AI) technologies aid in recognizing relevant events and assisting in daily tasks in homes, offices, hospitals, etc. The need to access or process personal information for these purposes raises privacy concerns. While software-level solutions like face de-identification provide a good privacy/utility trade-off, they present vulnerabilities to sniffing attacks. 
In this paper, we propose a hardware-level face de-identification method to solve this vulnerability. Specifically, our approach first learns an optical encoder along with a regression model to obtain a face heatmap while hiding the face identity from the source image. We also propose an anonymization framework that generates a new face using the privacy-preserving image, face heatmap, and a reference face image from a public dataset as input. We validate our approach with extensive simulations and hardware experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00777v1-abstract-full').style.display = 'none'; document.getElementById('2404.00777v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024. Project Website and Code coming soon</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17823">arXiv:2403.17823</a> <span> [<a href="https://arxiv.org/pdf/2403.17823">pdf</a>, <a href="https://arxiv.org/format/2403.17823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Image Pre-Training with Siamese Cropped Masked Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Eyma%C3%ABl%2C+A">Alexandre Eymaël</a>, <a href="/search/cs?searchtype=author&query=Vandeghen%2C+R">Renaud Vandeghen</a>, <a href="/search/cs?searchtype=author&query=Cioppa%2C+A">Anthony Cioppa</a>, <a href="/search/cs?searchtype=author&query=Giancola%2C+S">Silvio Giancola</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Van+Droogenbroeck%2C+M">Marc Van Droogenbroeck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17823v2-abstract-short" style="display: inline;"> Self-supervised pre-training of image encoders is omnipresent in the literature, particularly following the introduction of Masked autoencoders (MAE). Current efforts attempt to learn object-centric representations from motion in videos. In particular, SiamMAE recently introduced a Siamese network, training a shared-weight encoder from two frames of a video with a high asymmetric masking ratio (95… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17823v2-abstract-full').style.display = 'inline'; document.getElementById('2403.17823v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17823v2-abstract-full" style="display: none;"> Self-supervised pre-training of image encoders is omnipresent in the literature, particularly following the introduction of Masked autoencoders (MAE). Current efforts attempt to learn object-centric representations from motion in videos.
In particular, SiamMAE recently introduced a Siamese network, training a shared-weight encoder from two frames of a video with a high asymmetric masking ratio (95%). In this work, we propose CropMAE, an alternative approach to the Siamese pre-training introduced by SiamMAE. Our method specifically differs by exclusively considering pairs of cropped images sourced from the same image but cropped differently, deviating from the conventional pairs of frames extracted from a video. CropMAE therefore alleviates the need for video datasets, while maintaining competitive performances and drastically reducing pre-training and learning time. Furthermore, we demonstrate that CropMAE learns similar object-centric representations without explicit motion, showing that current self-supervised learning methods do not learn such representations from explicit object motion, but rather thanks to the implicit image transformations that occur between the two views. Finally, CropMAE achieves the highest masking ratio to date (98.5%), enabling the reconstruction of images using only two visible patches. Our code is available at https://github.com/alexandre-eymael/CropMAE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17823v2-abstract-full').style.display = 'none'; document.getElementById('2403.17823v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 7 figures, 5 tables, 3 pages of supplementary material. 
Paper accepted at ECCV 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13808">arXiv:2403.13808</a> <span> [<a href="https://arxiv.org/pdf/2403.13808">pdf</a>, <a href="https://arxiv.org/format/2403.13808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Pretraining Data Diversity for Self-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hammoud%2C+H+A+A+K">Hasan Abed Al Kader Hammoud</a>, <a href="/search/cs?searchtype=author&query=Das%2C+T">Tuhin Das</a>, <a href="/search/cs?searchtype=author&query=Pizzati%2C+F">Fabio Pizzati</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13808v3-abstract-short" style="display: inline;"> We explore the impact of training with more diverse datasets, characterized by the number of unique samples, on the performance of self-supervised learning (SSL) under a fixed computational budget. Our findings consistently demonstrate that increasing pretraining data diversity enhances SSL performance, albeit only when the distribution distance to the downstream data is minimal. Notably, even wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13808v3-abstract-full').style.display = 'inline'; document.getElementById('2403.13808v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13808v3-abstract-full" style="display: none;"> We explore the impact of training with more diverse datasets, characterized by the number of unique samples, on the performance of self-supervised learning (SSL) under a fixed computational budget. Our findings consistently demonstrate that increasing pretraining data diversity enhances SSL performance, albeit only when the distribution distance to the downstream data is minimal. Notably, even with an exceptionally large pretraining data diversity achieved through methods like web crawling or diffusion-generated data, among other ways, the distribution shift remains a challenge. Our experiments are comprehensive with seven SSL methods using large-scale datasets such as ImageNet and YFCC100M amounting to over 200 GPU days. 
Code and trained models are available at https://github.com/hammoudhasan/DiversitySSL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13808v3-abstract-full').style.display = 'none'; document.getElementById('2403.13808v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12003">arXiv:2403.12003</a> <span> [<a href="https://arxiv.org/pdf/2403.12003">pdf</a>, <a href="https://arxiv.org/format/2403.12003">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GenView: Enhancing View Quality with Pretrained Generative Model for Self-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianlong Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yue Yu</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12003v1-abstract-short" style="display: inline;"> Self-supervised learning has achieved remarkable success in acquiring high-quality representations from unlabeled data. The widely adopted contrastive learning framework aims to learn invariant representations by minimizing the distance between positive views originating from the same image. However, existing techniques to construct positive views highly rely on manual transformations, resulting i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12003v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12003v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12003v1-abstract-full" style="display: none;"> Self-supervised learning has achieved remarkable success in acquiring high-quality representations from unlabeled data. The widely adopted contrastive learning framework aims to learn invariant representations by minimizing the distance between positive views originating from the same image. However, existing techniques to construct positive views highly rely on manual transformations, resulting in limited diversity and potentially false positive pairs. 
To tackle these challenges, we present GenView, a controllable framework that augments the diversity of positive views leveraging the power of pretrained generative models while preserving semantics. We develop an adaptive view generation method that dynamically adjusts the noise level in sampling to ensure the preservation of essential semantic meaning while introducing variability. Additionally, we introduce a quality-driven contrastive loss, which assesses the quality of positive pairs by considering both foreground similarity and background diversity. This loss prioritizes the high-quality positive pairs we construct while reducing the influence of low-quality pairs, thereby mitigating potential semantic inconsistencies introduced by generative models and aggressive data augmentation. Thanks to the improved positive view quality and the quality-driven contrastive loss, GenView significantly improves self-supervised learning across various tasks. For instance, GenView improves MoCov2 performance by 2.5%/2.2% on ImageNet linear/semi-supervised classification. Moreover, GenView even performs much better than naively augmenting the ImageNet dataset with Laion400M or ImageNet21K. Code is available at https://github.com/xiaojieli0903/genview. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12003v1-abstract-full').style.display = 'none'; document.getElementById('2403.12003v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/xiaojieli0903/genview</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.10128">arXiv:2402.10128</a> <span> [<a href="https://arxiv.org/pdf/2402.10128">pdf</a>, <a href="https://arxiv.org/format/2402.10128">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GES: Generalized Exponential Splatting for Efficient Radiance Field Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamdi%2C+A">Abdullah Hamdi</a>, <a href="/search/cs?searchtype=author&query=Melas-Kyriazi%2C+L">Luke Melas-Kyriazi</a>, <a href="/search/cs?searchtype=author&query=Mai%2C+J">Jinjie Mai</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+G">Guocheng Qian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruoshi Liu</a>, <a href="/search/cs?searchtype=author&query=Vondrick%2C+C">Carl Vondrick</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Vedaldi%2C+A">Andrea Vedaldi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2402.10128v2-abstract-short" style="display: inline;"> Advancements in 3D Gaussian Splatting have significantly accelerated 3D reconstruction and generation. However, it may require a large number of Gaussians, which creates a substantial memory footprint. This paper introduces GES (Generalized Exponential Splatting), a novel representation that employs Generalized Exponential Function (GEF) to model 3D scenes, requiring far fewer particles to represe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10128v2-abstract-full').style.display = 'inline'; document.getElementById('2402.10128v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.10128v2-abstract-full" style="display: none;"> Advancements in 3D Gaussian Splatting have significantly accelerated 3D reconstruction and generation. However, it may require a large number of Gaussians, which creates a substantial memory footprint. This paper introduces GES (Generalized Exponential Splatting), a novel representation that employs Generalized Exponential Function (GEF) to model 3D scenes, requiring far fewer particles to represent a scene and thus significantly outperforming Gaussian Splatting methods in efficiency with a plug-and-play replacement ability for Gaussian-based utilities. GES is validated theoretically and empirically in both principled 1D setup and realistic 3D scenes. It is shown to represent signals with sharp edges more accurately, which are typically challenging for Gaussians due to their inherent low-pass characteristics. Our empirical analysis demonstrates that GEF outperforms Gaussians in fitting natural-occurring signals (e.g. squares, triangles, and parabolic signals), thereby reducing the need for extensive splitting operations that increase the memory footprint of Gaussian Splatting. With the aid of a frequency-modulated loss, GES achieves competitive performance in novel-view synthesis benchmarks while requiring less than half the memory storage of Gaussian Splatting and increasing the rendering speed by up to 39%. The code is available on the project website https://abdullahamdi.com/ges . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10128v2-abstract-full').style.display = 'none'; document.getElementById('2402.10128v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 paper. 
project website https://abdullahamdi.com/ges</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05235">arXiv:2402.05235</a> <span> [<a href="https://arxiv.org/pdf/2402.05235">pdf</a>, <a href="https://arxiv.org/format/2402.05235">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SPAD : Spatially Aware Multiview Diffusers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&query=Vasilkovsky%2C+M">Michael Vasilkovsky</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+G">Guocheng Qian</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jian Ren</a>, <a href="/search/cs?searchtype=author&query=Guler%2C+R+A">Riza Alp Guler</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Tulyakov%2C+S">Sergey Tulyakov</a>, <a href="/search/cs?searchtype=author&query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&query=Siarohin%2C+A">Aliaksandr Siarohin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05235v1-abstract-short" style="display: inline;"> We present SPAD, a novel approach for creating consistent multi-view images from text prompts or single images. To enable multi-view generation, we repurpose a pretrained 2D diffusion model by extending its self-attention layers with cross-view interactions, and fine-tune it on a high quality subset of Objaverse. We find that a naive extension of the self-attention proposed in prior work (e.g. MVD… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05235v1-abstract-full').style.display = 'inline'; document.getElementById('2402.05235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05235v1-abstract-full" style="display: none;"> We present SPAD, a novel approach for creating consistent multi-view images from text prompts or single images. To enable multi-view generation, we repurpose a pretrained 2D diffusion model by extending its self-attention layers with cross-view interactions, and fine-tune it on a high quality subset of Objaverse. We find that a naive extension of the self-attention proposed in prior work (e.g. MVDream) leads to content copying between views. Therefore, we explicitly constrain the cross-view attention based on epipolar geometry. To further enhance 3D consistency, we utilize Plucker coordinates derived from camera rays and inject them as positional encoding. This enables SPAD to reason over spatial proximity in 3D well. In contrast to recent works that can only generate views at fixed azimuth and elevation, SPAD offers full camera control and achieves state-of-the-art results in novel view synthesis on unseen objects from the Objaverse and Google Scanned Objects datasets. Finally, we demonstrate that text-to-3D generation using SPAD prevents the multi-face Janus issue. 
See more details at our webpage: https://yashkant.github.io/spad <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05235v1-abstract-full').style.display = 'none'; document.getElementById('2402.05235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Webpage: https://yashkant.github.io/spad</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.04559">arXiv:2402.04559</a> <span> [<a href="https://arxiv.org/pdf/2402.04559">pdf</a>, <a href="https://arxiv.org/format/2402.04559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Can Large Language Model Agents Simulate Human Trust Behavior? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chengxing Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Canyu Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+F">Feiran Jia</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Ziyu Ye</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+S">Shiyang Lai</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Ziniu Hu</a>, <a href="/search/cs?searchtype=author&query=Jurgens%2C+D">David Jurgens</a>, <a href="/search/cs?searchtype=author&query=Evans%2C+J">James Evans</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.04559v4-abstract-short" style="display: inline;"> Large Language Model (LLM) agents have been increasingly adopted as simulation tools to model humans in social science and role-playing applications. However, one fundamental question remains: can LLM agents really simulate human behavior? 
In this paper, we focus on one critical and elemental behavior in human interactions, trust, and investigate whether LLM agents can simulate human trust behavio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04559v4-abstract-full').style.display = 'inline'; document.getElementById('2402.04559v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.04559v4-abstract-full" style="display: none;"> Large Language Model (LLM) agents have been increasingly adopted as simulation tools to model humans in social science and role-playing applications. However, one fundamental question remains: can LLM agents really simulate human behavior? In this paper, we focus on one critical and elemental behavior in human interactions, trust, and investigate whether LLM agents can simulate human trust behavior. We first find that LLM agents generally exhibit trust behavior, referred to as agent trust, under the framework of Trust Games, which are widely recognized in behavioral economics. Then, we discover that GPT-4 agents manifest high behavioral alignment with humans in terms of trust behavior, indicating the feasibility of simulating human trust behavior with LLM agents. In addition, we probe the biases of agent trust and differences in agent trust towards other LLM agents and humans. We also explore the intrinsic properties of agent trust under conditions including external manipulations and advanced reasoning strategies. Our study provides new insights into the behaviors of LLM agents and the fundamental analogy between LLMs and humans beyond value alignment. We further illustrate broader implications of our discoveries for applications where trust is paramount. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04559v4-abstract-full').style.display = 'none'; document.getElementById('2402.04559v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Proceedings of NeurIPS 2024. The first two authors contributed equally. 10 pages for main paper, 56 pages including appendix. Project website: https://agent-trust.camel-ai.org</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01832">arXiv:2402.01832</a> <span> [<a href="https://arxiv.org/pdf/2402.01832">pdf</a>, <a href="https://arxiv.org/format/2402.01832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SynthCLIP: Are We Ready for a Fully Synthetic CLIP Training? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hammoud%2C+H+A+A+K">Hasan Abed Al Kader Hammoud</a>, <a href="/search/cs?searchtype=author&query=Itani%2C+H">Hani Itani</a>, <a href="/search/cs?searchtype=author&query=Pizzati%2C+F">Fabio Pizzati</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01832v2-abstract-short" style="display: inline;"> We present SynthCLIP, a CLIP model trained on entirely synthetic text-image pairs. Leveraging recent text-to-image (TTI) networks and large language models (LLM), we generate synthetic datasets of images and corresponding captions at scale, with no human intervention. In this work, we provide an analysis on CLIP models trained on synthetic data. We provide insights on the data generation strategy,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01832v2-abstract-full').style.display = 'inline'; document.getElementById('2402.01832v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01832v2-abstract-full" style="display: none;"> We present SynthCLIP, a CLIP model trained on entirely synthetic text-image pairs. Leveraging recent text-to-image (TTI) networks and large language models (LLM), we generate synthetic datasets of images and corresponding captions at scale, with no human intervention. In this work, we provide an analysis on CLIP models trained on synthetic data. We provide insights on the data generation strategy, number of samples required, scaling trends, and resulting properties. We also introduce SynthCI-30M, a purely synthetic dataset comprising 30 million captioned images. Our code, trained models, and data, are released as open source at https://github.com/hammoudhasan/SynthCLIP <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01832v2-abstract-full').style.display = 'none'; document.getElementById('2402.01832v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
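<p class="is-size-7">The SynthCLIP abstract above describes generating captions with a language model, synthesizing matching images with a text-to-image model, and training CLIP on the resulting pairs. The high-level loop below sketches that idea; generate_captions, generate_image, and the example concepts are placeholders for whatever backends are used, not the released code.</p>
<pre><code>
# High-level sketch of a synthetic text-image data pipeline in the spirit of
# the abstract above; all functions are placeholders, not SynthCLIP's code.

def generate_captions(concept, n):
    # placeholder: ask a language model for n captions about the concept
    return [f"a photo of a {concept}, variation {i}" for i in range(n)]

def generate_image(caption):
    # placeholder: run a text-to-image model on the caption
    return {"caption": caption, "pixels": None}

def build_synthetic_dataset(concepts, captions_per_concept=10):
    dataset = []
    for concept in concepts:
        for caption in generate_captions(concept, captions_per_concept):
            dataset.append(generate_image(caption))
    return dataset

pairs = build_synthetic_dataset(["golden retriever", "espresso machine"])
# The resulting pairs would then feed a standard CLIP contrastive training loop.
</code></pre>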
arXiv:2402.00867 [pdf, other] cs.CV
AToM: Amortized Text-to-Mesh using 2D Diffusion
Authors: Guocheng Qian, Junli Cao, Aliaksandr Siarohin, Yash Kant, Chaoyang Wang, Michael Vasilkovsky, Hsin-Ying Lee, Yuwei Fang, Ivan Skorokhodov, Peiye Zhuang, Igor Gilitschenski, Jian Ren, Bernard Ghanem, Kfir Aberman, Sergey Tulyakov
Abstract: We introduce Amortized Text-to-Mesh (AToM), a feed-forward text-to-mesh framework optimized across multiple text prompts simultaneously. In contrast to existing text-to-3D methods that often entail time-consuming per-prompt optimization and commonly output representations other than polygonal meshes, AToM directly generates high-quality textured meshes in less than 1 second with around 10 times reduction in the training cost, and generalizes to unseen prompts. Our key idea is a novel triplane-based text-to-mesh architecture with a two-stage amortized optimization strategy that ensures stable training and enables scalability.
Through extensive experiments on various prompt benchmarks, AToM significantly outperforms state-of-the-art amortized approaches with over 4 times higher accuracy (on the DF415 dataset) and produces more distinguishable and higher-quality 3D outputs. AToM demonstrates strong generalizability, offering fine-grained 3D assets for unseen interpolated prompts without further optimization during inference, unlike per-prompt solutions.
Submitted 1 February, 2024; originally announced February 2024.
Comments: 19 pages with appendix and references. Webpage: https://snap-research.github.io/AToM/

arXiv:2401.11470 [pdf, other] cs.CV
Exploring Missing Modality in Multimodal Egocentric Datasets
Authors: Merey Ramazanova, Alejandro Pardo, Humam Alwassel, Bernard Ghanem
Abstract: Multimodal video understanding is crucial for analyzing egocentric videos, where integrating multiple sensory signals significantly enhances action recognition and moment localization. However, practical applications often grapple with incomplete modalities due to factors like privacy concerns, efficiency demands, or hardware malfunctions. Addressing this, our study delves into the impact of missing modalities on egocentric action recognition, particularly within transformer-based models.
We introduce a novel concept, the Missing Modality Token (MMT), to maintain performance even when modalities are absent, a strategy that proves effective on the Ego4D, Epic-Kitchens, and Epic-Sounds datasets. Our method mitigates the performance loss, reducing it from its original $\sim 30\%$ drop to only $\sim 10\%$ when half of the test set is modal-incomplete. Through extensive experimentation, we demonstrate the adaptability of MMT to different training scenarios and its superiority in handling missing modalities compared to current methods. Our research contributes a comprehensive analysis and an innovative approach, opening avenues for more resilient multimodal systems in real-world settings.
Submitted 17 April, 2024; v1 submitted 21 January, 2024; originally announced January 2024.
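One way to picture the missing-modality-token idea: when a modality's features are unavailable, a learned embedding is substituted before fusion so the transformer still receives a complete token sequence. The PyTorch module below is an illustrative guess at such a design, not the authors' implementation.

```python
# Illustrative PyTorch sketch of a learned "missing modality token":
# when one modality's features are unavailable, a trainable embedding is
# substituted so the fusion transformer still sees a complete input.
# This is an assumption-level rendering, not the paper's code.
import torch
import torch.nn as nn

class MissingModalityFusion(nn.Module):
    def __init__(self, dim: int = 256, num_heads: int = 4):
        super().__init__()
        self.missing_token = nn.Parameter(torch.zeros(1, 1, dim))
        nn.init.normal_(self.missing_token, std=0.02)
        layer = nn.TransformerEncoderLayer(dim, num_heads, batch_first=True)
        self.fusion = nn.TransformerEncoder(layer, num_layers=2)

    def forward(self, video_feats, audio_feats=None):
        # video_feats: (B, Tv, dim); audio_feats: (B, Ta, dim) or None
        if audio_feats is None:
            b = video_feats.size(0)
            audio_feats = self.missing_token.expand(b, 1, -1)
        tokens = torch.cat([video_feats, audio_feats], dim=1)
        return self.fusion(tokens).mean(dim=1)   # clip-level representation

model = MissingModalityFusion()
video = torch.randn(2, 8, 256)
print(model(video, None).shape)                  # audio missing: token substituted
print(model(video, torch.randn(2, 4, 256)).shape)
```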
arXiv:2401.10228 [pdf, other] cs.CV
RAP-SAM: Towards Real-Time All-Purpose Segment Anything
Authors: Shilin Xu, Haobo Yuan, Qingyu Shi, Lu Qi, Jingbo Wang, Yibo Yang, Yining Li, Kai Chen, Yunhai Tong, Bernard Ghanem, Xiangtai Li, Ming-Hsuan Yang
Abstract: Advanced by the transformer architecture, vision foundation models (VFMs) have achieved remarkable progress in performance and generalization ability. The Segment Anything Model (SAM) is one remarkable model that achieves generalized segmentation. However, most VFMs cannot run in real time, which makes it difficult to transfer them into products. On the other hand, current real-time segmentation methods mainly serve a single purpose, such as semantic segmentation of driving scenes. We argue that diverse outputs are needed for real applications. Thus, this work explores a new real-time segmentation setting, named all-purpose segmentation in real time, to transfer VFMs to real-time deployment. It contains three different tasks: interactive segmentation, panoptic segmentation, and video segmentation. We aim to use one model to achieve the above tasks in real time. We first benchmark several strong baselines. Then, we present Real-Time All-Purpose SAM (RAP-SAM), which contains an efficient encoder and an efficient decoupled decoder to perform prompt-driven decoding. Moreover, we explore different training strategies and tuning methods to further boost co-training performance. Our code and model are available at https://github.com/xushilin1/RAP-SAM/.
Submitted 18 January, 2024; originally announced January 2024.
Comments: Project page: https://xushilin1.github.io/rap_sam/

arXiv:2401.04105 [pdf, other] cs.CV cs.AI
Dr$^2$Net: Dynamic Reversible Dual-Residual Networks for Memory-Efficient Finetuning
Authors: Chen Zhao, Shuming Liu, Karttikeya Mangalam, Guocheng Qian, Fatimah Zohra, Abdulmohsen Alghannam, Jitendra Malik, Bernard Ghanem
Abstract: Large pretrained models are increasingly crucial in modern computer vision tasks. These models are typically used in downstream tasks by end-to-end finetuning, which is highly memory-intensive for tasks with high-resolution data, e.g., video understanding, small object detection, and point cloud analysis. In this paper, we propose Dynamic Reversible Dual-Residual Networks, or Dr$^2$Net, a novel family of network architectures that act as surrogate networks for finetuning a pretrained model with substantially reduced memory consumption. Dr$^2$Net contains two types of residual connections: one maintains the residual structure of the pretrained model, and the other makes the network reversible. Due to this reversibility, intermediate activations, which can be reconstructed from the output, are cleared from memory during training.
We use separate coefficients on the two types of residual connections, and introduce a dynamic training strategy that seamlessly transitions the pretrained model to a reversible network with much higher numerical precision. We evaluate Dr$^2$Net on various pretrained models and various tasks, and show that it reaches performance comparable to conventional finetuning with significantly less memory usage.
Submitted 30 March, 2024; v1 submitted 8 January, 2024; originally announced January 2024.
Journal ref: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2024
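The reversibility argument can be made concrete with a RevNet-style coupling block that uses two residual coefficients: the inputs can be recomputed from the outputs, so intermediate activations need not be stored. This is a generic illustration under that assumption, not the exact Dr$^2$Net formulation.

```python
# Generic reversible coupling block with two residual coefficients
# (alpha on the residual branch, beta on the identity branch).
# Illustrates how inputs can be reconstructed from outputs so activations
# need not be stored; NOT the exact Dr^2Net architecture.
import torch
import torch.nn as nn

class ReversibleBlock(nn.Module):
    def __init__(self, dim: int, alpha: float = 0.1, beta: float = 1.0):
        super().__init__()
        self.alpha, self.beta = alpha, beta
        self.f = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.g = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))

    def forward(self, x1, x2):
        y1 = self.beta * x1 + self.alpha * self.f(x2)
        y2 = self.beta * x2 + self.alpha * self.g(y1)
        return y1, y2

    def inverse(self, y1, y2):
        # Reconstruct the inputs from the outputs (requires beta != 0).
        x2 = (y2 - self.alpha * self.g(y1)) / self.beta
        x1 = (y1 - self.alpha * self.f(x2)) / self.beta
        return x1, x2

block = ReversibleBlock(64)
a, b = torch.randn(2, 64), torch.randn(2, 64)
with torch.no_grad():
    y1, y2 = block(a, b)
    r1, r2 = block.inverse(y1, y2)
print(torch.allclose(a, r1, atol=1e-5), torch.allclose(b, r2, atol=1e-5))
```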
arXiv:2312.12487 [pdf, other] cs.LG cs.AI
Adaptive Guidance: Training-free Acceleration of Conditional Diffusion Models
Authors: Angela Castillo, Jonas Kohler, Juan C. Pérez, Juan Pablo Pérez, Albert Pumarola, Bernard Ghanem, Pablo Arbeláez, Ali Thabet
Abstract: This paper presents a comprehensive study of the role of Classifier-Free Guidance (CFG) in text-conditioned diffusion models from the perspective of inference efficiency. In particular, we relax the default choice of applying CFG in all diffusion steps and instead search for efficient guidance policies. We formulate the discovery of such policies in the differentiable Neural Architecture Search framework. Our findings suggest that the denoising steps proposed by CFG become increasingly aligned with simple conditional steps, which renders the extra neural network evaluation of CFG redundant, especially in the second half of the denoising process. Building upon this insight, we propose "Adaptive Guidance" (AG), an efficient variant of CFG that adaptively omits network evaluations when the denoising process displays convergence. Our experiments demonstrate that AG preserves CFG's image quality while reducing computation by 25%. Thus, AG constitutes a plug-and-play alternative to Guidance Distillation, achieving 50% of the latter's speed-ups while being training-free and retaining the capacity to handle negative prompts. Finally, we uncover further redundancies of CFG in the first half of the diffusion process, showing that entire neural function evaluations can be replaced by simple affine transformations of past score estimates. This method, termed LinearAG, offers even cheaper inference at the cost of deviating from the baseline model. Our findings provide insights into the efficiency of the conditional denoising process that contribute to more practical and swift deployment of text-conditioned diffusion models.
Submitted 19 December, 2023; originally announced December 2023.
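The adaptive-skipping idea can be illustrated with a toy sampling loop: evaluate both the conditional and unconditional branches early on, and once their predictions stay close, drop the unconditional pass and reuse plain conditional steps. The denoiser stubs, threshold, and update rule below are toy assumptions, not the guidance policy found by the paper's search.

```python
# Toy illustration of classifier-free guidance (CFG) with adaptive skipping:
# once the conditional and unconditional predictions are close, the
# unconditional pass is dropped and plain conditional steps are used.
# The denoiser stubs, threshold, and update rule are toy assumptions.
import numpy as np

def eps_cond(x, t):      # placeholder conditional denoiser (ignores t)
    return 0.25 * x

def eps_uncond(x, t):    # placeholder unconditional denoiser (ignores t)
    return 0.20 * x

def sample(steps=50, guidance=3.0, lr=0.1, skip_threshold=0.05, dim=16, seed=0):
    x = np.random.default_rng(seed).standard_normal(dim)
    skipping, uncond_evals = False, 0
    for t in range(steps, 0, -1):
        e_c = eps_cond(x, t)
        if not skipping:
            e_u = eps_uncond(x, t)
            uncond_evals += 1
            if np.linalg.norm(e_c - e_u) < skip_threshold:
                skipping = True          # predictions converged: stop paying for CFG
            eps = e_u + guidance * (e_c - e_u)
        else:
            eps = e_c                    # cheaper, conditional-only step
        x = x - lr * eps                 # toy denoising update
    return x, uncond_evals

_, n = sample()
print(f"unconditional evaluations used: {n} / 50")
```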
arXiv:2312.10639 [pdf, other] cs.CV cs.AI physics.optics
Artificial intelligence optical hardware empowers high-resolution hyperspectral video understanding at 1.2 Tb/s
Authors: Maksim Makarenko, Qizhou Wang, Arturo Burguete-Lopez, Silvio Giancola, Bernard Ghanem, Luca Passone, Andrea Fratalocchi
Abstract: Foundation models, exemplified by GPT technology, are discovering new horizons in artificial intelligence by executing tasks beyond their designers' expectations. While the present generation provides fundamental advances in understanding language and images, the next frontier is video comprehension. Progress in this area must overcome the 1 Tb/s data rate demanded to grasp real-time multidimensional video information. This speed limit lies well beyond the capabilities of the existing generation of hardware, imposing a roadblock to further advances. This work introduces a hardware-accelerated integrated optoelectronic platform for multidimensional video understanding in real time. The technology platform combines artificial intelligence hardware, processing information optically, with state-of-the-art machine vision networks, resulting in a data processing speed of 1.2 Tb/s with hundreds of frequency bands and megapixel spatial resolution at video rates.
Such performance, validated in the AI tasks of video semantic segmentation and object understanding in indoor and aerial applications, surpasses the speed of the closest technologies with similar spectral resolution by three to four orders of magnitude. This platform opens up new avenues for research in real-time AI video understanding of multidimensional visual information, helping to empower future human-machine interactions and cognitive processing developments.
Submitted 17 December, 2023; originally announced December 2023.

arXiv:2312.02219 [pdf, other] cs.CV cs.CL
Behind the Magic, MERLIM: Multi-modal Evaluation Benchmark for Large Image-Language Models
Authors: Andrés Villa, Juan Carlos León Alcázar, Alvaro Soto, Bernard Ghanem
Abstract: Large Vision and Language Models have enabled significant advances in fully supervised and zero-shot visual tasks. These large architectures serve as the baseline for what is currently known as Instruction Tuning Large Vision and Language models (IT-LVLMs). IT-LVLMs are general-purpose multi-modal assistants whose responses are modulated by natural language instructions and visual data.
Despite this versatility, the effectiveness of IT-LVLMs on fundamental computer vision problems remains unclear, primarily due to the absence of a standardized evaluation benchmark. This paper introduces a Multi-modal Evaluation Benchmark named MERLIM, a scalable test-bed to assess the capabilities of IT-LVLMs on fundamental computer vision tasks. MERLIM contains over 300K image-question pairs and has a strong focus on detecting cross-modal "hallucination" events in IT-LVLMs. Our results bring important insights into the performance of state-of-the-art IT-LVLMs, including limitations in identifying fine-grained visual concepts, object hallucinations across tasks, and biases towards the language query. Our findings also suggest that these models have weak visual grounding, but still manage to make adequate guesses from global visual patterns or language biases contained in the LLM component.
Submitted 12 June, 2024; v1 submitted 3 December, 2023; originally announced December 2023.
Comments: 16 pages, 7 figures, 6 tables

arXiv:2311.18259 [pdf, other] cs.CV cs.AI
Ego-Exo4D: Understanding Skilled Human Activity from First- and Third-Person Perspectives
Authors: Kristen Grauman, Andrew Westbury, Lorenzo Torresani, Kris Kitani, Jitendra Malik, Triantafyllos Afouras, Kumar Ashutosh, Vijay Baiyya, Siddhant Bansal, Bikram Boote, Eugene Byrne, Zach Chavis, Joya Chen, Feng Cheng, Fu-Jen Chu,
Sean Crane, Avijit Dasgupta, Jing Dong, Maria Escobar, Cristhian Forigua, Abrham Gebreselasie, Sanjay Haresh, Jing Huang, Md Mohaiminul Islam, Suyog Jain, et al. (76 additional authors not shown)
Abstract: We present Ego-Exo4D, a diverse, large-scale multimodal multiview video dataset and benchmark challenge. Ego-Exo4D centers around simultaneously-captured egocentric and exocentric video of skilled human activities (e.g., sports, music, dance, bike repair). 740 participants from 13 cities worldwide performed these activities in 123 different natural scene contexts, yielding long-form captures from 1 to 42 minutes each and 1,286 hours of video combined. The multimodal nature of the dataset is unprecedented: the video is accompanied by multichannel audio, eye gaze, 3D point clouds, camera poses, IMU, and multiple paired language descriptions, including a novel "expert commentary" done by coaches and teachers and tailored to the skilled-activity domain. To push the frontier of first-person video understanding of skilled human activity, we also present a suite of benchmark tasks and their annotations, including fine-grained activity understanding, proficiency estimation, cross-view translation, and 3D hand/body pose. All resources are open sourced to fuel new research in the community. Project page: http://ego-exo4d-data.org/
Submitted 25 September, 2024; v1 submitted 30 November, 2023; originally announced November 2023.
Comments: Expanded manuscript (compared to the arXiv v1 from November 2023 and the CVPR 2024 paper from June 2024) with a more comprehensive dataset and benchmark presentation, plus new results on the v2 data release

arXiv:2311.17241 [pdf, other] cs.CV
End-to-End Temporal Action Detection with 1B Parameters Across 1000 Frames
Authors: Shuming Liu, Chen-Lin Zhang, Chen Zhao, Bernard Ghanem
Abstract: Recently, temporal action detection (TAD) has seen significant performance improvement with end-to-end training. However, due to the memory bottleneck, only models with limited scales and limited data volumes can afford end-to-end training, which inevitably restricts TAD performance. In this paper, we reduce the memory consumption for end-to-end training, and manage to scale up the TAD backbone to 1 billion parameters and the input video to 1,536 frames, leading to significant detection performance. The key to our approach lies in our proposed temporal-informative adapter (TIA), a novel lightweight module that reduces training memory. Using TIA, we free the huge backbone from having to adapt to the TAD task by updating only the parameters in TIA. TIA also leads to better TAD representation by temporally aggregating context from adjacent frames throughout the backbone. We evaluate our model across four representative datasets. Owing to our efficient design, we are able to train end-to-end on VideoMAEv2-giant and achieve 75.4% mAP on THUMOS14, being the first end-to-end model to outperform the best feature-based methods. Code is available at https://github.com/sming256/AdaTAD.
Submitted 20 April, 2024; v1 submitted 28 November, 2023; originally announced November 2023.
Comments: Accepted to CVPR 2024. Camera-ready version.
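The adapter described in this abstract can be pictured as a small residual module that mixes information across adjacent frames while the backbone itself stays frozen. The bottleneck size and the depthwise temporal convolution below are illustrative assumptions, not the released AdaTAD code.

```python
# Illustrative sketch of a lightweight temporal adapter attached to a frozen
# backbone: a bottleneck projection plus a depthwise temporal convolution
# that aggregates context from adjacent frames, added residually.
# Layer sizes and the exact design are assumptions, not the AdaTAD code.
import torch
import torch.nn as nn

class TemporalAdapter(nn.Module):
    def __init__(self, dim: int = 768, bottleneck: int = 64, kernel: int = 3):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.temporal = nn.Conv1d(bottleneck, bottleneck, kernel,
                                  padding=kernel // 2, groups=bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):                      # x: (batch, time, dim)
        h = self.down(x).transpose(1, 2)       # -> (batch, bottleneck, time)
        h = torch.relu(self.temporal(h)).transpose(1, 2)
        return x + self.up(h)                  # residual: backbone features + context

# Freeze a (stand-in) backbone block and train only the adapter parameters.
backbone_block = nn.Linear(768, 768)
for p in backbone_block.parameters():
    p.requires_grad = False
adapter = TemporalAdapter()

feats = torch.randn(2, 1536, 768)              # 1,536 frames of 768-d features
out = adapter(backbone_block(feats))
print(out.shape, sum(p.numel() for p in adapter.parameters() if p.requires_grad))
```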
arXiv:2311.16671 [pdf, other] cs.CV cs.AI cs.GR
SplitNeRF: Split Sum Approximation Neural Field for Joint Geometry, Illumination, and Material Estimation
Authors: Jesus Zarzar, Bernard Ghanem
Abstract: We present a novel approach for digitizing real-world objects by estimating their geometry, material properties, and environmental lighting from a set of posed images with fixed lighting. Our method incorporates into Neural Radiance Field (NeRF) pipelines the split sum approximation used with image-based lighting for real-time physically based rendering. We propose modeling the scene's lighting with a single scene-specific MLP representing pre-integrated image-based lighting at arbitrary resolutions. We achieve accurate modeling of pre-integrated lighting by exploiting a novel regularizer based on efficient Monte Carlo sampling. Additionally, we propose a new method of supervising self-occlusion predictions by exploiting a similar regularizer based on Monte Carlo sampling. Experimental results demonstrate the efficiency and effectiveness of our approach in estimating scene geometry, material properties, and lighting. Our method attains state-of-the-art relighting quality after only ${\sim}1$ hour of training on a single NVIDIA A100 GPU.
Submitted 28 November, 2023; originally announced November 2023.
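For reference, the split sum approximation named in this title is usually stated in the following Monte Carlo form from real-time image-based lighting (the standard identity, not notation taken from the paper):

$$\frac{1}{N}\sum_{k=1}^{N} \frac{L_i(\mathbf{l}_k)\, f(\mathbf{l}_k,\mathbf{v})\,(\mathbf{n}\cdot\mathbf{l}_k)}{p(\mathbf{l}_k)} \;\approx\; \left(\frac{\sum_{k} L_i(\mathbf{l}_k)\,(\mathbf{n}\cdot\mathbf{l}_k)}{\sum_{k} (\mathbf{n}\cdot\mathbf{l}_k)}\right) \left(\frac{1}{N}\sum_{k=1}^{N} \frac{f(\mathbf{l}_k,\mathbf{v})\,(\mathbf{n}\cdot\mathbf{l}_k)}{p(\mathbf{l}_k)}\right),$$

with the first factor pre-filtered into the lighting representation (here, a scene-specific MLP) and the second pre-integrated over the BRDF $f$.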
arXiv:2311.11293 [pdf, other] cs.LG
From Categories to Classifiers: Name-Only Continual Learning by Exploring the Web
Authors: Ameya Prabhu, Hasan Abed Al Kader Hammoud, Ser-Nam Lim, Bernard Ghanem, Philip H. S. Torr, Adel Bibi
Abstract: Continual Learning (CL) often relies on the availability of extensive annotated datasets, an assumption that is unrealistically time-consuming and costly in practice. We explore a novel paradigm termed name-only continual learning, where time and cost constraints prohibit manual annotation. In this scenario, learners adapt to new category shifts using only category names, without the luxury of annotated training data. Our proposed solution leverages the expansive and ever-evolving internet to query and download uncurated webly-supervised data for image classification. We investigate the reliability of our web data and find them comparable, and in some cases superior, to manually annotated datasets. Additionally, we show that by harnessing the web we can build support sets that surpass those of state-of-the-art name-only classification methods, which construct support sets using generative models or image retrieval from LAION-5B, achieving up to a 25% boost in accuracy. When applied across varied continual learning contexts, our method consistently exhibits only a small performance gap compared to models trained on manually annotated datasets. We present EvoTrends, a class-incremental dataset made from the web to capture real-world trends, created in just minutes. Overall, this paper underscores the potential of using uncurated webly-supervised data to mitigate the challenges associated with manual data labeling in continual learning.
Submitted 4 September, 2024; v1 submitted 19 November, 2023; originally announced November 2023.
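A minimal sketch of the name-only setting: given category names alone, query a web image search, keep the uncurated results, and train on them. The search_images client below is a hypothetical stand-in; the paper's querying, filtering, and training recipe are not reproduced here.

```python
# Hedged sketch of name-only learning from the web: category names in,
# uncurated web images out, then a classifier trained on whatever came back.
# search_images() is a hypothetical stand-in for a web image-search client;
# it is not part of the paper's released pipeline.
from typing import List, Tuple

def search_images(query: str, max_results: int = 100) -> List[str]:
    """Hypothetical web search client returning image URLs for a query."""
    return [f"https://example.com/{query.replace(' ', '_')}/{i}.jpg"
            for i in range(max_results)]

def build_webly_supervised_set(category_names: List[str]) -> List[Tuple[str, int]]:
    dataset = []
    for label, name in enumerate(category_names):
        for url in search_images(name):
            dataset.append((url, label))      # no manual annotation involved
    return dataset

# New categories arrive as names only; the support set is refreshed from the web.
categories = ["fidget spinner", "air fryer", "e-scooter"]
webly_set = build_webly_supervised_set(categories)
print(len(webly_set), webly_set[0])
# A classifier (e.g., a linear probe on frozen features) would then be trained
# on the downloaded images; that step is omitted in this sketch.
```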
arXiv:2310.08358 [pdf, other] cs.LG
Towards Demystifying the Generalization Behaviors When Neural Collapse Emerges
Authors: Peifeng Gao, Qianqian Xu, Yibo Yang, Peisong Wen, Huiyang Shao, Zhiyong Yang, Bernard Ghanem, Qingming Huang
Abstract: Neural Collapse (NC) is a well-known phenomenon of deep neural networks in the terminal phase of training (TPT). It is characterized by the collapse of the features and the classifier into a symmetrical structure known as a simplex equiangular tight frame (ETF). While there have been extensive studies on optimization characteristics showing the global optimality of neural collapse, little research has been done on the generalization behaviors that arise while NC occurs. In particular, the important phenomenon of generalization improvement during TPT has remained an empirical observation lacking a rigorous theoretical explanation. In this paper, we establish the connection between the minimization of the cross-entropy (CE) loss and a multi-class SVM during TPT, and then derive a multi-class margin generalization bound, which provides a theoretical explanation for why continued training can still improve accuracy on the test set even after the training accuracy has reached 100%. Additionally, our further theoretical results indicate that different alignments between labels and features in a simplex ETF can result in varying degrees of generalization improvement, despite all models reaching NC and demonstrating similar optimization performance on the training set. We refer to this newly discovered property as "non-conservative generalization". In experiments, we also provide empirical observations to verify the indications suggested by our theoretical results.
Submitted 12 October, 2023; originally announced October 2023.
Comments: 20 pages, 6 figures. arXiv admin note: substantial text overlap with arXiv:2304.08914
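As a reminder of the structure this abstract refers to (the standard construction from the neural collapse literature, not code from the paper), a C-class simplex ETF can be built and checked numerically:

```python
# Construct a C-class simplex equiangular tight frame (ETF), the structure
# that features and classifier weights collapse to in Neural Collapse.
# This is the standard textbook construction, not code from the paper.
import numpy as np

def simplex_etf(num_classes: int) -> np.ndarray:
    c = num_classes
    return np.sqrt(c / (c - 1)) * (np.eye(c) - np.ones((c, c)) / c)

M = simplex_etf(4)                           # rows are the C vertex directions
norms = np.linalg.norm(M, axis=1)            # every vertex has unit length
cosines = (M @ M.T)[~np.eye(4, dtype=bool)]  # off-diagonal pairwise cosines
print(np.round(norms, 4))                    # -> all 1.0
print(np.round(cosines, 4))                  # -> all -1/(C-1) = -0.3333
```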