Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 608 results for author: <span class="mathjax">Tang, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Tang%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Tang, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tang%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tang, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Tang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12154">arXiv:2502.12154</a> <span> [<a href="https://arxiv.org/pdf/2502.12154">pdf</a>, <a href="https://arxiv.org/format/2502.12154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Models without Classifier-free Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhicong Tang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+B">Baining Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12154v1-abstract-short" style="display: inline;"> This paper presents Model-guidance (MG), a novel objective for training diffusion model that addresses and removes of the commonly used Classifier-free guidance (CFG). Our innovative approach transcends the standard modeling of solely data distribution to incorporating the posterior probability of conditions. 
The proposed technique originates from the idea of CFG and is easy yet effective, making… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12154v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12154v1-abstract-full" style="display: none;"> This paper presents Model-guidance (MG), a novel objective for training diffusion model that addresses and removes of the commonly used Classifier-free guidance (CFG). Our innovative approach transcends the standard modeling of solely data distribution to incorporating the posterior probability of conditions. The proposed technique originates from the idea of CFG and is easy yet effective, making it a plug-and-play module for existing models. Our method significantly accelerates the training process, doubles the inference speed, and achieve exceptional quality that parallel and even surpass concurrent diffusion models with CFG. Extensive experiments demonstrate the effectiveness, efficiency, scalability on different models and datasets. Finally, we establish state-of-the-art performance on ImageNet 256 benchmarks with an FID of 1.34. Our code is available at https://github.com/tzco/Diffusion-wo-CFG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12154v1-abstract-full').style.display = 'none'; document.getElementById('2502.12154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
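For context on what removing CFG buys at inference time, the sketch below contrasts a standard classifier-free guidance step (two network evaluations per step, one conditional and one unconditional) with the single pass a guidance-free model needs. This is an editorial illustration rather than the paper's method; `denoiser` is a placeholder for a trained noise-prediction network.

```python
# Illustrative sketch only: classifier-free guidance (CFG) costs two forward
# passes per sampling step, while a guidance-free model costs one.
# `denoiser` stands in for a trained noise-prediction network eps_theta(x, t, c).
import numpy as np

def denoiser(x, t, cond=None):
    # Placeholder network returning a fake noise estimate.
    return 0.1 * x if cond is None else 0.1 * x + 0.01 * cond

def cfg_step(x, t, cond, guidance_scale=4.0):
    eps_uncond = denoiser(x, t, cond=None)   # pass 1
    eps_cond = denoiser(x, t, cond=cond)     # pass 2
    # CFG extrapolates between the two predictions.
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

def guidance_free_step(x, t, cond):
    # A model trained to absorb the guidance signal uses a single pass.
    return denoiser(x, t, cond=cond)

x, cond = np.zeros(8), np.ones(8)
print(cfg_step(x, 0.5, cond)[:3], guidance_free_step(x, 0.5, cond)[:3])
```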
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11058">arXiv:2502.11058</a> <span> [<a href="https://arxiv.org/pdf/2502.11058">pdf</a>, <a href="https://arxiv.org/format/2502.11058">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> DreamDDP: Accelerating Data Parallel Distributed LLM Training with Layer-wise Scheduled Partial Synchronization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zichen Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junlin Huang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xinglin Pan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rudan Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A+C">Amelie Chi Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shaohuai Shi</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11058v1-abstract-short" style="display: inline;"> The growth of large language models (LLMs) increases challenges of accelerating distributed training across multiple GPUs in different data centers. Moreover, concerns about data privacy and data exhaustion have heightened interest in geo-distributed data centers. Communication in geo-distributed data parallel training (DDP) with stochastic gradient descent (S-SGD) is the main bottleneck in low-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11058v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11058v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11058v1-abstract-full" style="display: none;"> The growth of large language models (LLMs) increases challenges of accelerating distributed training across multiple GPUs in different data centers. Moreover, concerns about data privacy and data exhaustion have heightened interest in geo-distributed data centers. Communication in geo-distributed data parallel training (DDP) with stochastic gradient descent (S-SGD) is the main bottleneck in low-bandwidth environments. Local SGD mitigates communication overhead by reducing synchronization frequency, and recent studies have successfully applied it to geo-distributedly pre-train LLMs. However, we identify that its model synchronization mechanism prevents overlapping communication and computation, which makes the system lose opportunities to overlap communication and computation. To overcome this limitation, we expand the design space of local SGD by layer-wisely decoupling model synchronization. In each iteration, only some layers are synchronized instead of the entire model after a specific number of iterations. 
Leveraging this methodology, we introduce DreamDDP, a training framework to accelerate low-bandwidth distributed training with three key innovations: (1) partial local SGD with theoretical assurances of convergence rates comparable to S-SGD; (2) overlapping parameter synchronization with computation without extra GPU memory occupation; (3) identifying and exploiting three properties to schedule the communication and computation to reduce the training time based on fine-grained profiling of layer-wise communication and computation time. Empirical evaluations conducted on 32 GPUs using prominent deep learning models, including ResNet-18, ResNet-50, GPT-2, and Llama-2, demonstrate that DreamDDP enhances the convergence properties of Local SGD (and Adam) and achieves speedups ranging from $1.49\times$ to $3.91\times$ over leading baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11058v1-abstract-full').style.display = 'none'; document.getElementById('2502.11058v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span> [<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, 
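The core idea in the abstract above, synchronizing only a subset of layers per iteration instead of the whole model every few steps, can be sketched in a few lines. This is a minimal single-process simulation under simplifying assumptions (plain SGD, toy gradients, averaging as a stand-in for all-reduce, a fixed rotating schedule); the actual system additionally overlaps synchronization with computation and schedules layers from profiling data.

```python
# Minimal sketch of layer-wise partial synchronization: each iteration averages
# a rotating subset of layers across workers instead of the full model.
# Simplified assumptions: plain SGD, toy gradients, in-process "all-reduce".
import numpy as np

rng = np.random.default_rng(0)
num_workers, num_layers, layers_per_step, dim = 4, 8, 2, 16
workers = [[rng.standard_normal(dim) for _ in range(num_layers)]
           for _ in range(num_workers)]

def partial_sync(worker_params, layer_ids):
    # Average only the selected layers across workers.
    for l in layer_ids:
        avg = np.mean([w[l] for w in worker_params], axis=0)
        for w in worker_params:
            w[l] = avg.copy()

for step in range(100):
    # Local SGD step on every worker (toy gradients).
    for w in workers:
        for l in range(num_layers):
            w[l] -= 0.1 * 0.01 * rng.standard_normal(dim)
    # Rotate through layer groups so every layer is synchronized periodically.
    start = (step * layers_per_step) % num_layers
    partial_sync(workers, [(start + i) % num_layers for i in range(layers_per_step)])
```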
<a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v2-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v2-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v2-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v2-abstract-full').style.display = 'none'; document.getElementById('2502.10248v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09104">arXiv:2502.09104</a> <span> [<a href="https://arxiv.org/pdf/2502.09104">pdf</a>, <a href="https://arxiv.org/ps/2502.09104">ps</a>, <a href="https://arxiv.org/format/2502.09104">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> One-shot Federated Learning Methods: A Practical Guide </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xia Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yijun Song</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Sijie Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zemin Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Linshan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09104v1-abstract-short" style="display: inline;"> One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing privacy and communication overhead issues associated with multiple rounds of data exchange in traditional Federated Learning (FL). OFL demonstrates the practical potential for integration with future approaches that require collaborative training mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09104v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09104v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09104v1-abstract-full" style="display: none;"> One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing privacy and communication overhead issues associated with multiple rounds of data exchange in traditional Federated Learning (FL). OFL demonstrates the practical potential for integration with future approaches that require collaborative training models, such as large language models (LLMs). However, current OFL methods face two major challenges: data heterogeneity and model heterogeneity, which result in subpar performance compared to conventional FL methods. Worse still, despite numerous studies addressing these limitations, a comprehensive summary is still lacking. To address these gaps, this paper presents a systematic analysis of the challenges faced by OFL and thoroughly reviews the current methods. We also offer an innovative categorization method and analyze the trade-offs of various techniques. 
4. arXiv:2502.09104 [pdf, ps, other]
   Title: One-shot Federated Learning Methods: A Practical Guide
   Authors: Xiang Liu, Zhenheng Tang, Xia Li, Yijun Song, Sijie Ji, Zemin Liu, Bo Han, Linshan Jiang, Jialin Li
   Subjects: cs.LG; cs.AI
   Abstract: One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing the privacy and communication-overhead issues associated with the multiple rounds of data exchange in traditional Federated Learning (FL). OFL shows practical potential for integration with future approaches that require collaborative model training, such as large language models (LLMs). However, current OFL methods face two major challenges, data heterogeneity and model heterogeneity, which result in subpar performance compared with conventional FL methods. Worse still, despite numerous studies addressing these limitations, a comprehensive summary is still lacking. To address these gaps, this paper presents a systematic analysis of the challenges faced by OFL and thoroughly reviews the current methods. We also offer an innovative categorization and analyze the trade-offs of various techniques. Additionally, we discuss the most promising future directions and the technologies that should be integrated into the OFL field. This work aims to provide guidance and insights for future research.
   Submitted 13 February, 2025; originally announced February 2025.
   Comments: 10 pages, 1 figure

5. arXiv:2502.08967 [pdf, other]
   Title: Low Complexity Artificial Noise Aided Beam Focusing Design in Near-Field Terahertz Communications
   Authors: Zhifeng Tang, Nan Yang, Xiangyun Zhou, Salman Durrani, Markku Juntti, Josep Miquel Jornet
   Subjects: cs.IT
   Abstract: In this paper, we develop a novel low-complexity artificial noise (AN) aided beam focusing scheme for a near-field terahertz wiretap communication system. In this system, a base station (BS) equipped with a large-scale array transmits signals to a legitimate user while mitigating information leakage to an eavesdropper. We formulate an optimization problem to maximize the secrecy rate achieved at the legitimate user and solve it by designing the optimal beam focusing and power allocation. Numerical results demonstrate the significant performance improvement achieved by the proposed AN aided beam focusing scheme, especially when the eavesdropper is located closer to the BS than the legitimate user.
   Submitted 13 February, 2025; originally announced February 2025.
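The objective maximized in the entry above, the secrecy rate, is not spelled out in the abstract; the textbook definition is the positive part of the gap between the legitimate user's rate and the eavesdropper's rate. The snippet below evaluates that definition for assumed SINR values; the paper's near-field and artificial-noise modeling is not reproduced here.

```python
# Textbook secrecy rate: R_s = max(log2(1 + SINR_user) - log2(1 + SINR_eve), 0).
# The SINR values below are illustrative assumptions only.
import math

def secrecy_rate(sinr_user, sinr_eve):
    return max(math.log2(1 + sinr_user) - math.log2(1 + sinr_eve), 0.0)

# Artificial noise mainly degrades the eavesdropper's SINR, widening the gap.
print(secrecy_rate(sinr_user=100.0, sinr_eve=10.0))  # example without AN
print(secrecy_rate(sinr_user=90.0, sinr_eve=1.0))    # example with AN
```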
6. arXiv:2502.07243 [pdf, other]
   Title: Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement
   Authors: Xueyao Zhang, Xiaohui Zhang, Kainan Peng, Zhenyu Tang, Vimal Manohar, Yingru Liu, Jeff Hwang, Dangna Li, Yuhao Wang, Julian Chan, Yuan Huang, Zhizheng Wu, Mingbo Ma
   Subjects: cs.SD; cs.AI
   Abstract: The imitation of voice, targeted at specific speech attributes such as timbre and speaking style, is crucial in speech generation. However, existing methods rely heavily on annotated data and struggle to disentangle timbre and style effectively, making controllable generation difficult, especially in zero-shot scenarios. To address these issues, we propose Vevo, a versatile zero-shot voice imitation framework with controllable timbre and style. Vevo operates in two core stages: (1) Content-Style Modeling: given text or the content tokens of speech as input, an autoregressive transformer generates content-style tokens, prompted by a style reference; (2) Acoustic Modeling: given the content-style tokens as input, a flow-matching transformer produces acoustic representations, prompted by a timbre reference. To obtain the content and content-style tokens of speech, we design a fully self-supervised approach that progressively decouples the timbre, style, and linguistic content of speech. Specifically, we adopt VQ-VAE as the tokenizer for the continuous hidden features of HuBERT. We treat the vocabulary size of the VQ-VAE codebook as the information bottleneck and adjust it carefully to obtain disentangled speech representations. Trained purely with self-supervision on 60K hours of audiobook speech data, without any fine-tuning on style-specific corpora, Vevo matches or surpasses existing methods in accent and emotion conversion tasks. Its effectiveness in zero-shot voice conversion and text-to-speech further demonstrates its strong generalization and versatility. Audio samples are available at https://versavoice.github.io.
   Submitted 10 February, 2025; originally announced February 2025.
   Comments: Accepted by ICLR 2025
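The information-bottleneck idea above, shrinking the VQ-VAE codebook so tokens keep linguistic content but shed timbre detail, can be illustrated with plain nearest-neighbor quantization: a smaller vocabulary forces more information to be discarded. Random vectors stand in for HuBERT features here; this is a conceptual sketch, not the authors' tokenizer.

```python
# Conceptual sketch of codebook size as an information bottleneck: quantize the
# same features with a small vs. large random codebook and compare how much
# information (reconstruction error) is lost.
import numpy as np

rng = np.random.default_rng(0)
features = rng.standard_normal((1000, 64))   # stand-in for HuBERT hidden features

def quantize(x, codebook):
    # Nearest-codeword assignment (squared Euclidean distance).
    dists = (x**2).sum(1)[:, None] - 2 * x @ codebook.T + (codebook**2).sum(1)[None, :]
    return codebook[dists.argmin(axis=1)]

for vocab_size in (32, 8192):
    codebook = rng.standard_normal((vocab_size, 64))
    mse = ((features - quantize(features, codebook)) ** 2).mean()
    # Smaller vocabulary -> higher error, i.e. more attributes squeezed out;
    # this is the knob Vevo tunes to drop timbre while keeping content.
    print(vocab_size, round(float(mse), 3))
```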
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07058">arXiv:2502.07058</a> <span> [<a href="https://arxiv.org/pdf/2502.07058">pdf</a>, <a href="https://arxiv.org/format/2502.07058">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Using Contextually Aligned Online Reviews to Measure LLMs' Performance Disparities Across Language Varieties </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zixin Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chieh-Yang Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tsung-Chi Li</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+H+Y+S">Ho Yin Sam Ng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hen-Hsen Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T+%27">Ting-Hao 'Kenneth' Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07058v2-abstract-short" style="display: inline;"> A language can have different varieties. These varieties can affect the performance of natural language processing (NLP) models, including large language models (LLMs), which are often trained on data from widely spoken varieties. This paper introduces a novel and cost-effective approach to benchmark model performance across language varieties. We argue that international online review platforms,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07058v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07058v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07058v2-abstract-full" style="display: none;"> A language can have different varieties. These varieties can affect the performance of natural language processing (NLP) models, including large language models (LLMs), which are often trained on data from widely spoken varieties. This paper introduces a novel and cost-effective approach to benchmark model performance across language varieties. We argue that international online review platforms, such as Booking.com, can serve as effective data sources for constructing datasets that capture comments in different language varieties from similar real-world scenarios, like reviews for the same hotel with the same rating using the same language (e.g., Mandarin Chinese) but different language varieties (e.g., Taiwan Mandarin, Mainland Mandarin). To prove this concept, we constructed a contextually aligned dataset comprising reviews in Taiwan Mandarin and Mainland Mandarin and tested six LLMs in a sentiment analysis task. Our results show that LLMs consistently underperform in Taiwan Mandarin. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07058v2-abstract-full').style.display = 'none'; document.getElementById('2502.07058v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2025 Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics (NAACL), theme track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05855">arXiv:2502.05855</a> <span> [<a href="https://arxiv.org/pdf/2502.05855">pdf</a>, <a href="https://arxiv.org/format/2502.05855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DexVLA: Vision-Language Model with Plug-In Diffusion Expert for General Robot Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+J">Junjie Wen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinming Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhibin Tang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chaomin Shen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Feifei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05855v1-abstract-short" style="display: inline;"> Enabling robots to perform diverse tasks across varied environments is a central challenge in robot learning. While vision-language-action (VLA) models have shown promise for generalizable robot skills, realizing their full potential requires addressing limitations in action representation and efficient training. Current VLA models often focus on scaling the vision-language model (VLM) component,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05855v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05855v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05855v1-abstract-full" style="display: none;"> Enabling robots to perform diverse tasks across varied environments is a central challenge in robot learning. While vision-language-action (VLA) models have shown promise for generalizable robot skills, realizing their full potential requires addressing limitations in action representation and efficient training. Current VLA models often focus on scaling the vision-language model (VLM) component, while the action space representation remains a critical bottleneck. 
8. arXiv:2502.05855 [pdf, other]
   Title: DexVLA: Vision-Language Model with Plug-In Diffusion Expert for General Robot Control
   Authors: Junjie Wen, Yichen Zhu, Jinming Li, Zhibin Tang, Chaomin Shen, Feifei Feng
   Subjects: cs.RO; cs.CV
   Abstract: Enabling robots to perform diverse tasks across varied environments is a central challenge in robot learning. While vision-language-action (VLA) models have shown promise for generalizable robot skills, realizing their full potential requires addressing limitations in action representation and efficient training. Current VLA models often focus on scaling the vision-language model (VLM) component, while the action-space representation remains a critical bottleneck. This paper introduces DexVLA, a novel framework designed to enhance the efficiency and generalization capabilities of VLAs for complex, long-horizon tasks across diverse robot embodiments. DexVLA features a novel diffusion-based action expert, scaled to one billion parameters, designed for cross-embodiment learning. A novel embodiment curriculum learning strategy facilitates efficient training: (1) pre-training the diffusion expert, which is separable from the VLA, on cross-embodiment data; (2) aligning the VLA model to specific embodiments; and (3) post-training for rapid adaptation to new tasks. We conduct comprehensive experiments across multiple embodiments, including single-arm, bimanual, and dexterous-hand robots, demonstrating DexVLA's adaptability to challenging tasks without task-specific adaptation, its ability to learn dexterous skills on novel embodiments with limited data, and its capacity to complete complex, long-horizon tasks using only direct language prompting, such as laundry folding. In all settings, our method demonstrates superior performance compared to state-of-the-art models such as Octo, OpenVLA, and Diffusion Policy.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: The webpage is at https://dex-vla.github.io/

9. arXiv:2502.05400
   Title: Dynamic Noise Preference Optimization for LLM Self-Improvement via Synthetic Data
   Authors: Haoyan Yang, Ting Hua, Shangqian Gao, Binfeng Xu, Zheng Tang, Jie Xu, Hongxia Jin, Vijay Srinivasan
   Subjects: cs.CL
   Abstract: Although LLMs have achieved significant success, their reliance on large volumes of human-annotated data has limited their potential for further scaling. In this situation, utilizing self-generated synthetic data has become crucial for fine-tuning LLMs without extensive human annotation. However, current methods often fail to ensure consistent improvements across iterations, with performance stagnating after only minimal updates. To overcome these challenges, we introduce Dynamic Noise Preference Optimization (DNPO). DNPO employs a dynamic sample-labeling mechanism to construct preference pairs for training and introduces controlled, trainable noise into the preference optimization process. Our approach effectively prevents stagnation and enables continuous improvement. In experiments with Zephyr-7B, DNPO consistently outperforms existing methods, showing an average performance boost of 2.6% across multiple benchmarks. Additionally, DNPO shows a significant improvement in model-generated data quality, with a 29.4% win-loss rate gap compared to the baseline in GPT-4 evaluations. This highlights its effectiveness in enhancing model performance through iterative refinement.
   Submitted 11 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
   Comments: Due to an update in the company's publication approval process, a newly appointed manager has been added to the review workflow. As a result, we need to resubmit the application for approval under the revised process, and we are therefore temporarily withdrawing this submission until the new approval workflow is completed.
10. arXiv:2502.04670 [pdf, other]
    Title: CCS: Controllable and Constrained Sampling with Diffusion Models via Initial Noise Perturbation
    Authors: Bowen Song, Zecheng Zhang, Zhaoxu Luo, Jason Hu, Wei Yuan, Jing Jia, Zhengxu Tang, Guanyang Wang, Liyue Shen
    Subjects: cs.LG; cs.AI
    Abstract: Diffusion models have emerged as powerful tools for generative tasks, producing high-quality outputs across diverse domains. However, how the generated data respond to perturbations of the initial noise remains under-explored, which hinders understanding of the controllability of the sampling process. In this work, we first observe an interesting phenomenon: under diffusion ODE sampling, the relationship between the change in generation outputs and the scale of the initial noise perturbation is highly linear. We then provide theoretical and empirical studies to justify this linearity of the input-output (noise to generated data) relationship. Inspired by these insights, we propose a novel Controllable and Constrained Sampling method (CCS), together with a new controller algorithm, for diffusion models to sample with desired statistical properties while preserving good sample quality. We perform extensive experiments comparing our sampling approach with other methods on both sampling controllability and sampled data quality. Results show that CCS achieves more precisely controlled sampling while maintaining superior sample quality and diversity.
    Submitted 7 February, 2025; originally announced February 2025.
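The "highly linear" noise-to-output relationship described above is the kind of property one can probe numerically once a deterministic sampler is fixed: perturb the initial noise along a direction, scale the perturbation, and measure how far the output moves. The sketch below does this for a toy deterministic map standing in for a diffusion ODE sampler; it illustrates the measurement, not the paper's models or results.

```python
# Probe of the measurement described in the CCS abstract: output change vs.
# initial-noise perturbation scale under a deterministic sampler.
# `toy_ode_sampler` is a smooth stand-in for a diffusion ODE sampler.
import numpy as np

rng = np.random.default_rng(0)

def toy_ode_sampler(z):
    # Smooth deterministic map from initial noise to a "sample".
    return np.tanh(0.5 * z) + 0.1 * z

z0 = rng.standard_normal(512)
direction = rng.standard_normal(512)
direction /= np.linalg.norm(direction)

base = toy_ode_sampler(z0)
for scale in (0.01, 0.02, 0.04, 0.08):
    delta = np.linalg.norm(toy_ode_sampler(z0 + scale * direction) - base)
    # For a smooth sampler the change grows roughly linearly with the scale.
    print(scale, round(float(delta), 5))
```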
In this work, we observe that different layers exhibit varying levels of parameter conflicts. Building on this insight, we average layers with minimal parameter conflicts and use a novel task-level expert routing for layers with significant conflicts. To further reduce storage costs, inspired by task arithmetic sparsity, we decouple multiple fine-tuned experts into a dense expert and several sparse experts. Considering the out-of-distribution samples, we select and merge appropriate experts based on the task uncertainty of the input data. We conduct extensive experiments on both LLaMA and Qwen with varying parameter scales, and evaluate on real-world reasoning tasks. Results demonstrate that our method consistently achieves significant performance improvements while requiring less system cost compared to existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04411v2-abstract-full').style.display = 'none'; document.getElementById('2502.04411v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">work in progress. arXiv admin note: text overlap with arXiv:2405.09673 by other authors</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03678">arXiv:2502.03678</a> <span> [<a href="https://arxiv.org/pdf/2502.03678">pdf</a>, <a href="https://arxiv.org/format/2502.03678">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reflection-Window Decoding: Text Generation with Selective Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zeyu Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenhao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Loka Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiangchen Song</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yunlong Deng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yifan Shen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Spirtes%2C+P">Peter Spirtes</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03678v1-abstract-short" style="display: inline;"> The autoregressive decoding for text generation in large language models (LLMs), while 
widely used, is inherently suboptimal due to the lack of a built-in mechanism to perform refinement and/or correction of the generated content. In this paper, we consider optimality in terms of the joint probability over the generated response, when jointly considering all tokens at the same time. We theoretical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03678v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03678v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03678v1-abstract-full" style="display: none;"> The autoregressive decoding for text generation in large language models (LLMs), while widely used, is inherently suboptimal due to the lack of a built-in mechanism to perform refinement and/or correction of the generated content. In this paper, we consider optimality in terms of the joint probability over the generated response, when jointly considering all tokens at the same time. We theoretically characterize the potential deviation of the autoregressively generated response from its globally optimal counterpart that is of the same length. Our analysis suggests that we need to be cautious when noticeable uncertainty arises during text generation, which may signal the sub-optimality of the generation history. To address the pitfall of autoregressive decoding for text generation, we propose an approach that incorporates a sliding reflection window and a pausing criterion, such that refinement and generation can be carried out interchangeably as the decoding proceeds. Our selective refinement framework strikes a balance between efficiency and optimality, and our extensive experimental results demonstrate the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03678v1-abstract-full').style.display = 'none'; document.getElementById('2502.03678v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
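<p class="is-size-7">To make the idea above concrete, here is a minimal sketch (not the paper's algorithm or code) of decoding with a sliding reflection window and an uncertainty-based pausing criterion; the toy model, the entropy threshold, and the <code>refine</code> step are all hypothetical placeholders.</p>
<pre><code class="language-python">
# Illustrative sketch only: pause and refine the last `window` tokens whenever
# recent token-level uncertainty (entropy) is high; the model is a stand-in.
import math, random

random.seed(0)
VOCAB = list("abcde")

def next_token_probs(context):
    # Stand-in for an LLM's next-token distribution.
    weights = [random.random() + 0.1 for _ in VOCAB]
    total = sum(weights)
    return [w / total for w in weights]

def entropy(probs):
    return -sum(p * math.log(p) for p in probs if p > 0)

def refine(span):
    # Hypothetical refinement step: here we simply resample the span once.
    return [random.choice(VOCAB) for _ in span]

def decode(max_len=30, window=5, pause_threshold=1.5):
    tokens, entropies = [], []
    while len(tokens) < max_len:
        probs = next_token_probs(tokens)
        entropies.append(entropy(probs))
        tokens.append(max(zip(probs, VOCAB))[1])  # greedy next-token step
        recent = entropies[-window:]
        if len(recent) == window and sum(recent) / window > pause_threshold:
            # Pausing criterion met: revisit the reflection window before continuing.
            tokens[-window:] = refine(tokens[-window:])
            entropies[-window:] = []  # reset uncertainty for the refined span
    return "".join(tokens)

print(decode())
</code></pre>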
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02690">arXiv:2502.02690</a> <span> [<a href="https://arxiv.org/pdf/2502.02690">pdf</a>, <a href="https://arxiv.org/format/2502.02690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Controllable Video Generation with Provable Disentanglement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yifan Shen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Peiyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zijian Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaoan Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zeyu Tang</a>, <a href="/search/cs?searchtype=author&query=Deka%2C+N">Namrata Deka</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zongfang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02690v1-abstract-short" style="display: inline;"> Controllable video generation remains a significant challenge, despite recent advances in generating high-quality and consistent videos. Most existing methods for controlling video generation treat the video as a whole, neglecting intricate fine-grained spatiotemporal relationships, which limits both control precision and efficiency. In this paper, we propose Controllable Video Generative Adversar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02690v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02690v1-abstract-full" style="display: none;"> Controllable video generation remains a significant challenge, despite recent advances in generating high-quality and consistent videos. Most existing methods for controlling video generation treat the video as a whole, neglecting intricate fine-grained spatiotemporal relationships, which limits both control precision and efficiency. In this paper, we propose Controllable Video Generative Adversarial Networks (CoVoGAN) to disentangle the video concepts, thus facilitating efficient and independent control over individual concepts. Specifically, following the minimal change principle, we first disentangle static and dynamic latent variables. We then leverage the sufficient change property to achieve component-wise identifiability of dynamic latent variables, enabling independent control over motion and identity. To establish the theoretical foundation, we provide a rigorous analysis demonstrating the identifiability of our approach. Building on these theoretical insights, we design a Temporal Transition Module to disentangle latent dynamics. 
To enforce the minimal change principle and sufficient change property, we minimize the dimensionality of latent dynamic variables and impose temporal conditional independence. To validate our approach, we integrate this module as a plug-in for GANs. Extensive qualitative and quantitative experiments on various video generation benchmarks demonstrate that our method significantly improves generation quality and controllability across diverse real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02690v1-abstract-full').style.display = 'none'; document.getElementById('2502.02690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01941">arXiv:2502.01941</a> <span> [<a href="https://arxiv.org/pdf/2502.01941">pdf</a>, <a href="https://arxiv.org/format/2502.01941">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can LLMs Maintain Fundamental Abilities under KV Cache Compression? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+P">Peijie Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zeyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiuze Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuming Hu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01941v1-abstract-short" style="display: inline;"> This paper investigates an under-explored challenge in large language models (LLMs): the impact of KV cache compression methods on LLMs' fundamental capabilities. While existing methods achieve impressive compression ratios on long-context benchmarks, their effects on core model capabilities remain understudied. We present a comprehensive empirical study evaluating prominent KV cache compression m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01941v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01941v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01941v1-abstract-full" style="display: none;"> This paper investigates an under-explored challenge in large language models (LLMs): the impact of KV cache compression methods on LLMs' fundamental capabilities. 
While existing methods achieve impressive compression ratios on long-context benchmarks, their effects on core model capabilities remain understudied. We present a comprehensive empirical study evaluating prominent KV cache compression methods across diverse tasks, spanning world knowledge, commonsense reasoning, arithmetic reasoning, code generation, safety, and long-context understanding and generation. Our analysis reveals that KV cache compression methods exhibit task-specific performance degradation. Arithmetic reasoning tasks prove particularly sensitive to aggressive compression, with different methods showing performance drops of $17.4\%$-$43.3\%$. Notably, the DeepSeek R1 Distill model exhibits more robust compression tolerance compared to instruction-tuned models, showing only $9.67\%$-$25.53\%$ performance degradation. Based on our analysis of attention patterns and cross-task compression performance, we propose ShotKV, a novel compression approach that distinctly handles prefill and decoding phases while maintaining shot-level semantic coherence. Empirical results show that ShotKV achieves $9\%$-$18\%$ performance improvements on long-context generation tasks under aggressive compression ratios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01941v1-abstract-full').style.display = 'none'; document.getElementById('2502.01941v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00665">arXiv:2502.00665</a> <span> [<a href="https://arxiv.org/pdf/2502.00665">pdf</a>, <a href="https://arxiv.org/format/2502.00665">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Modal Synergies: Unveiling the Potential of Motion-Aware Fusion Networks in Handling Dynamic and Static ReID Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+F">Fuxi Ling</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongye Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Guoqiang Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hong Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhihao Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00665v1-abstract-short" style="display: inline;"> Navigating the complexities of person re-identification (ReID) in varied surveillance scenarios, particularly when occlusions occur, poses significant challenges. We introduce an innovative Motion-Aware Fusion (MOTAR-FUSE) network that utilizes motion cues derived from static imagery to significantly enhance ReID capabilities.
This network incorporates a dual-input visual adapter capable of proces… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00665v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00665v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00665v1-abstract-full" style="display: none;"> Navigating the complexities of person re-identification (ReID) in varied surveillance scenarios, particularly when occlusions occur, poses significant challenges. We introduce an innovative Motion-Aware Fusion (MOTAR-FUSE) network that utilizes motion cues derived from static imagery to significantly enhance ReID capabilities. This network incorporates a dual-input visual adapter capable of processing both images and videos, thereby facilitating more effective feature extraction. A unique aspect of our approach is the integration of a motion consistency task, which empowers the motion-aware transformer to adeptly capture the dynamics of human motion. This technique substantially improves the recognition of features in scenarios where occlusions are prevalent, thereby advancing the ReID process. Our comprehensive evaluations across multiple ReID benchmarks, including holistic, occluded, and video-based scenarios, demonstrate that our MOTAR-FUSE network achieves superior performance compared to existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00665v1-abstract-full').style.display = 'none'; document.getElementById('2502.00665v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00299">arXiv:2502.00299</a> <span> [<a href="https://arxiv.org/pdf/2502.00299">pdf</a>, <a href="https://arxiv.org/format/2502.00299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+P">Peijie Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zeyu Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuming Hu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00299v1-abstract-short" style="display: inline;"> To reduce memory costs in long-context inference with Large Language Models (LLMs), many recent works focus on compressing the key-value (KV) cache of different tokens. 
However, we identify that the previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in real-world language characteristics. In light of this, we introduce C… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00299v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00299v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00299v1-abstract-full" style="display: none;"> To reduce memory costs in long-context inference with Large Language Models (LLMs), many recent works focus on compressing the key-value (KV) cache of different tokens. However, we identify that the previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in real-world language characteristics. In light of this, we introduce ChunkKV, grouping the tokens in a chunk as a basic compressing unit, and retaining the most informative semantic chunks while discarding the less important ones. Furthermore, observing that ChunkKV exhibits higher similarity in the preserved indices across different layers, we propose layer-wise index reuse to further reduce computational overhead. We evaluated ChunkKV on cutting-edge long-context benchmarks including LongBench and Needle-In-A-HayStack, as well as the GSM8K and JailbreakV in-context learning benchmark. Our experiments with instruction tuning and multi-step reasoning (O1 and R1) LLMs achieve up to 10\% performance improvement under aggressive compression ratios compared to existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00299v1-abstract-full').style.display = 'none'; document.getElementById('2502.00299v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
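<p class="is-size-7">As a rough illustration of the chunk-level idea described in the abstract above (not the authors' implementation), the sketch below groups consecutive tokens into chunks, scores each chunk, and keeps only the highest-scoring chunks; the token scores, chunk size, and the <code>compress_kv</code> helper are hypothetical.</p>
<pre><code class="language-python">
# Illustrative sketch only: chunk-level KV cache eviction with toy values.
def compress_kv(kv_cache, token_scores, chunk_size=4, keep_chunks=2):
    """kv_cache: list of per-token (key, value) pairs; token_scores: one score per token."""
    assert len(kv_cache) == len(token_scores)
    # 1) Group consecutive tokens into chunks and score each chunk as a unit.
    chunks = [
        (i, sum(token_scores[i:i + chunk_size]))
        for i in range(0, len(kv_cache), chunk_size)
    ]
    # 2) Retain the most informative chunks, preserving their original order.
    kept = sorted(sorted(chunks, key=lambda c: c[1], reverse=True)[:keep_chunks])
    kept_indices = [
        j for start, _ in kept for j in range(start, min(start + chunk_size, len(kv_cache)))
    ]
    # 3) The surviving indices could in principle be reused across layers.
    return [kv_cache[j] for j in kept_indices], kept_indices

# Toy usage: 12 tokens, higher scores in the middle of the sequence.
cache = [(f"k{i}", f"v{i}") for i in range(12)]
scores = [0.1, 0.2, 0.1, 0.3, 0.9, 0.8, 0.7, 0.9, 0.2, 0.1, 0.2, 0.1]
compressed, idx = compress_kv(cache, scores)
print(idx)  # indices of the two highest-scoring chunks, in order
</code></pre>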
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17295">arXiv:2501.17295</a> <span> [<a href="https://arxiv.org/pdf/2501.17295">pdf</a>, <a href="https://arxiv.org/format/2501.17295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Hallucinated Translations in Large Language Models with Hallucination-focused Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zilu Tang</a>, <a href="/search/cs?searchtype=author&query=Chatterjee%2C+R">Rajen Chatterjee</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sarthak Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17295v1-abstract-short" style="display: inline;"> Machine Translation (MT) is undergoing a paradigm shift, with systems based on fine-tuned large language models (LLM) becoming increasingly competitive with traditional encoder-decoder models trained specifically for translation tasks. However, LLM-based systems are at a higher risk of generating hallucinations, which can severely undermine user's trust and safety. Most prior research on hallucina… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17295v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17295v1-abstract-full" style="display: none;"> Machine Translation (MT) is undergoing a paradigm shift, with systems based on fine-tuned large language models (LLM) becoming increasingly competitive with traditional encoder-decoder models trained specifically for translation tasks. However, LLM-based systems are at a higher risk of generating hallucinations, which can severely undermine user's trust and safety. Most prior research on hallucination mitigation focuses on traditional MT models, with solutions that involve post-hoc mitigation - detecting hallucinated translations and re-translating them. While effective, this approach introduces additional complexity in deploying extra tools in production and also increases latency. To address these limitations, we propose a method that intrinsically learns to mitigate hallucinations during the model training phase. Specifically, we introduce a data creation framework to generate hallucination focused preference datasets. Fine-tuning LLMs on these preference datasets reduces the hallucination rate by an average of 96% across five language pairs, while preserving overall translation quality. In a zero-shot setting our approach reduces hallucinations by 89% on an average across three unseen target languages. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17295v1-abstract-full').style.display = 'none'; document.getElementById('2501.17295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2025 Main Conference Long paper (9 pages)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NAACL 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15429">arXiv:2501.15429</a> <span> [<a href="https://arxiv.org/pdf/2501.15429">pdf</a>, <a href="https://arxiv.org/format/2501.15429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3701551.3703528">10.1145/3701551.3703528 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An Aspect Performance-aware Hypergraph Neural Network for Review-based Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junrui Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tong Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zifang Tang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuan Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15429v1-abstract-short" style="display: inline;"> Online reviews allow consumers to provide detailed feedback on various aspects of items. Existing methods utilize these aspects to model users' fine-grained preferences for specific item features through graph neural networks. We argue that the performance of items on different aspects is important for making precise recommendations, which has not been taken into account by existing approaches, du… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15429v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15429v1-abstract-full" style="display: none;"> Online reviews allow consumers to provide detailed feedback on various aspects of items. Existing methods utilize these aspects to model users' fine-grained preferences for specific item features through graph neural networks. 
We argue that the performance of items on different aspects is important for making precise recommendations, which has not been taken into account by existing approaches, due to lack of data. In this paper, we propose an aspect performance-aware hypergraph neural network (APH) for the review-based recommendation, which learns the performance of items from the conflicting sentiment polarity of user reviews. Specifically, APH comprehensively models the relationships among users, items, aspects, and sentiment polarity by systematically constructing an aspect hypergraph based on user reviews. In addition, APH aggregates aspects representing users and items by employing an aspect performance-aware hypergraph aggregation method. It aggregates the sentiment polarities from multiple users by jointly considering user preferences and the semantics of their sentiments, determining the weights of sentiment polarities to infer the performance of items on various aspects. Such performances are then used as weights to aggregate neighboring aspects. Experiments on six real-world datasets demonstrate that APH improves MSE, Precision@5, and Recall@5 by an average of 2.30%, 4.89%, and 1.60% over the best baseline. The source code and data are available at https://github.com/dianziliu/APH. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15429v1-abstract-full').style.display = 'none'; document.getElementById('2501.15429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, accepted by WSDM'25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15316">arXiv:2501.15316</a> <span> [<a href="https://arxiv.org/pdf/2501.15316">pdf</a>, <a href="https://arxiv.org/format/2501.15316">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ToMoE: Converting Dense Large Language Models to Mixture-of-Experts through Dynamic Structural Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shangqian Gao</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+T">Ting Hua</a>, <a href="/search/cs?searchtype=author&query=Shirkavand%2C+R">Reza Shirkavand</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chi-Heng Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhen Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengao Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Longge Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fangyi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ganjdanesh%2C+A">Alireza Ganjdanesh</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+L">Lou Qian</a>, <a href="/search/cs?searchtype=author&query=Jie%2C+X">Xu 
Jie</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+Y">Yen-Chang Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15316v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable abilities in tackling a wide range of complex tasks. However, their huge computational and memory costs raise significant challenges in deploying these models on resource-constrained devices or efficiently serving them. Prior approaches have attempted to alleviate these problems by permanently removing less important model structures, yet t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15316v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15316v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15316v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable abilities in tackling a wide range of complex tasks. However, their huge computational and memory costs raise significant challenges in deploying these models on resource-constrained devices or efficiently serving them. Prior approaches have attempted to alleviate these problems by permanently removing less important model structures, yet these methods often result in substantial performance degradation due to the permanent deletion of model parameters. In this work, we tried to mitigate this issue by reducing the number of active parameters without permanently removing them. Specifically, we introduce a differentiable dynamic pruning method that pushes dense models to maintain a fixed number of active parameters by converting their MLP layers into a Mixture of Experts (MoE) architecture. Our method, even without fine-tuning, consistently outperforms previous structural pruning techniques across diverse model families, including Phi-2, LLaMA-2, LLaMA-3, and Qwen-2.5. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15316v1-abstract-full').style.display = 'none'; document.getElementById('2501.15316v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
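<p class="is-size-7">The following toy sketch (not the paper's method) illustrates the general flavour of converting a dense FFN into a mixture of experts by partitioning its hidden neurons into expert slices and activating only the top-scoring slice per token; all shapes and the linear router are hypothetical.</p>
<pre><code class="language-python">
# Illustrative sketch only: a dense FFN whose hidden neurons are split into
# expert slices, with a router that activates one slice per token.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
d_model, d_hidden, n_experts, top_k = 16, 64, 4, 1
W_in = torch.randn(d_hidden, d_model) * 0.02   # dense FFN weights being reused
W_out = torch.randn(d_model, d_hidden) * 0.02
router = torch.randn(n_experts, d_model) * 0.02
slice_size = d_hidden // n_experts             # each expert owns a slice of neurons

def moe_ffn(x):                                # x: (tokens, d_model)
    gate = F.softmax(x @ router.t(), dim=-1)   # (tokens, n_experts)
    topv, topi = gate.topk(top_k, dim=-1)
    out = torch.zeros_like(x)
    for e in range(n_experts):
        rows = (topi == e).any(dim=-1)         # tokens routed to expert e
        if not rows.any():
            continue
        sl = slice(e * slice_size, (e + 1) * slice_size)
        h = F.relu(x[rows] @ W_in[sl].t())     # only this expert's neuron slice is active
        weight = topv[topi == e].unsqueeze(-1) # gate weight for these tokens
        out[rows] += weight * (h @ W_out[:, sl].t())
    return out

print(moe_ffn(torch.randn(5, d_model)).shape)  # torch.Size([5, 16])
</code></pre>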
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15032">arXiv:2501.15032</a> <span> [<a href="https://arxiv.org/pdf/2501.15032">pdf</a>, <a href="https://arxiv.org/format/2501.15032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Stealthy Voice Eavesdropping with Acoustic Metamaterials: Unraveling a New Privacy Threat </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ning%2C+Z">Zhiyuan Ning</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhanyong Tang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Juan He</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+W">Weizhi Meng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuntian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15032v1-abstract-short" style="display: inline;"> We present SuperEar, a novel privacy threat based on acoustic metamaterials. Unlike previous research, SuperEar can surreptitiously track and eavesdrop on the phone calls of a moving outdoor target from a safe distance. To design this attack, SuperEar overcomes the challenges faced by traditional acoustic metamaterials, including low low-frequency gain and audio distortion during reconstruction. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15032v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15032v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15032v1-abstract-full" style="display: none;"> We present SuperEar, a novel privacy threat based on acoustic metamaterials. Unlike previous research, SuperEar can surreptitiously track and eavesdrop on the phone calls of a moving outdoor target from a safe distance. To design this attack, SuperEar overcomes the challenges faced by traditional acoustic metamaterials, including low low-frequency gain and audio distortion during reconstruction. It successfully magnifies the speech signal by approximately 20 times, allowing the sound to be captured from the earpiece of the target phone. In addition, SuperEar optimizes the trade-off between the number and size of acoustic metamaterials, improving the portability and concealability of the interceptor while ensuring effective interception performance. This makes it highly suitable for outdoor tracking and eavesdropping scenarios. Through extensive experimentation, we have evaluated SuperEar and our results show that it can achieve an eavesdropping accuracy of over 80% within a range of 4.5 meters in the aforementioned scenario, thus validating its great potential in real-world applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15032v1-abstract-full').style.display = 'none'; document.getElementById('2501.15032v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15031">arXiv:2501.15031</a> <span> [<a href="https://arxiv.org/pdf/2501.15031">pdf</a>, <a href="https://arxiv.org/format/2501.15031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> A Portable and Stealthy Inaudible Voice Attack Based on Acoustic Metamaterials </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ning%2C+Z">Zhiyuan Ning</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Juan He</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhanyong Tang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Weihang Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaojiang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15031v1-abstract-short" style="display: inline;"> We present METAATTACK, the first approach to leverage acoustic metamaterials for inaudible attacks for voice control systems. Compared to the state-of-the-art inaudible attacks requiring complex and large speaker setups, METAATTACK achieves a longer attacking range and higher accuracy using a compact, portable device small enough to be put into a carry bag. These improvements in portability and st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15031v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15031v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15031v1-abstract-full" style="display: none;"> We present METAATTACK, the first approach to leverage acoustic metamaterials for inaudible attacks for voice control systems. Compared to the state-of-the-art inaudible attacks requiring complex and large speaker setups, METAATTACK achieves a longer attacking range and higher accuracy using a compact, portable device small enough to be put into a carry bag. These improvements in portability and stealth have led to the practical applicability of inaudible attacks and their adaptation to a wider range of scenarios. We demonstrate how the recent advancement in metamaterials can be utilized to design a voice attack system with carefully selected implementation parameters and commercial off-the-shelf components. We showcase that METAATTACK can be used to launch inaudible attacks for representative voice-controlled personal assistants, including Siri, Alexa, Google Assistant, XiaoAI, and Xiaoyi. The average word accuracy of all assistants is 76%, with a range of 8.85 m. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15031v1-abstract-full').style.display = 'none'; document.getElementById('2501.15031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14588">arXiv:2501.14588</a> <span> [<a href="https://arxiv.org/pdf/2501.14588">pdf</a>, <a href="https://arxiv.org/format/2501.14588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data Assetization via Resources-decoupled Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jianzhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Feida Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lingyan He</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zixin Tang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mingce Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shiyu Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+G">Guibing Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14588v2-abstract-short" style="display: inline;"> With the development of the digital economy, data is increasingly recognized as an essential resource for both work and life. However, due to privacy concerns, data owners tend to maximize the value of data through the circulation of information rather than direct data transfer. Federated learning (FL) provides an effective approach to collaborative training models while preserving privacy. Howeve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14588v2-abstract-full').style.display = 'inline'; document.getElementById('2501.14588v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14588v2-abstract-full" style="display: none;"> With the development of the digital economy, data is increasingly recognized as an essential resource for both work and life. However, due to privacy concerns, data owners tend to maximize the value of data through the circulation of information rather than direct data transfer. Federated learning (FL) provides an effective approach to collaborative training models while preserving privacy. However, as model parameters and training data grow, there are not only real differences in data resources between different data owners, but also mismatches between data and computing resources. These challenges lead to inadequate collaboration among data owners, compute centers, and model owners, reducing the global utility of the three parties and the effectiveness of data assetization. In this work, we first propose a framework for resource-decoupled FL involving three parties. 
Then, we design a Tripartite Stackelberg Model and theoretically analyze the Stackelberg-Nash equilibrium (SNE) for participants to optimize global utility. Next, we propose the Quality-aware Dynamic Resources-decoupled FL algorithm (QD-RDFL), in which we derive and solve the optimal strategies of all parties to achieve SNE using backward induction. We also design a dynamic optimization mechanism to improve the optimal strategy profile by evaluating the contribution of data quality from data owners to the global model during real training. Finally, our extensive experiments demonstrate that our method effectively encourages the linkage of the three parties involved, maximizing the global utility and value of data assets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14588v2-abstract-full').style.display = 'none'; document.getElementById('2501.14588v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14492">arXiv:2501.14492</a> <span> [<a href="https://arxiv.org/pdf/2501.14492">pdf</a>, <a href="https://arxiv.org/format/2501.14492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RealCritic: Towards Effectiveness-Driven Evaluation of Language Model Critiques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziniu Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhenyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tian Ding</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14492v1-abstract-short" style="display: inline;"> Critiques are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. 
In this work, we introduce a new benchmark designed to assess the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14492v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14492v1-abstract-full" style="display: none;"> Critiques are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. In this work, we introduce a new benchmark designed to assess the critique capabilities of LLMs. Unlike existing benchmarks, which typically function in an open-loop fashion, our approach employs a closed-loop methodology that evaluates the quality of corrections generated from critiques. Moreover, the benchmark incorporates features such as self-critique, cross-critique, and iterative critique, which are crucial for distinguishing the abilities of advanced reasoning models from more classical ones. We implement this benchmark using eight challenging reasoning tasks. We have several interesting findings. First, despite demonstrating comparable performance in direct chain-of-thought generation, classical LLMs significantly lag behind the advanced reasoning-based model o1-mini across all critique scenarios. Second, in self-critique and iterative critique settings, classical LLMs may even underperform relative to their baseline capabilities. We hope that this benchmark will serve as a valuable resource to guide future advancements. The code and data are available at \url{https://github.com/tangzhy/RealCritic}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14492v1-abstract-full').style.display = 'none'; document.getElementById('2501.14492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
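<p class="is-size-7">A minimal sketch of the closed-loop evaluation idea described above (not the benchmark's code): a critique is credited only if the correction written from it actually fixes the answer. The <code>solve</code>, <code>critique</code>, and <code>revise</code> functions stand in for LLM calls and are hypothetical.</p>
<pre><code class="language-python">
# Illustrative sketch only: closed-loop critique scoring with stand-in model calls.
def solve(problem):
    return problem["wrong_answer"]            # pretend the first attempt is flawed

def critique(problem, answer):
    return f"The answer {answer} ignores the multiplication-before-addition rule."

def revise(problem, answer, feedback):
    return problem["gold"]                    # pretend the critique enables a fix

def closed_loop_score(problems):
    fixed = 0
    for p in problems:
        first = solve(p)
        corrected = revise(p, first, critique(p, first))
        fixed += int(corrected == p["gold"])  # credit only if the correction is right
    return fixed / len(problems)

tasks = [{"question": "2+3*4", "gold": "14", "wrong_answer": "20"}]
print(closed_loop_score(tasks))  # 1.0 in this toy run
</code></pre>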
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13629">arXiv:2501.13629</a> <span> [<a href="https://arxiv.org/pdf/2501.13629">pdf</a>, <a href="https://arxiv.org/format/2501.13629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Sigma: Differential Rescaling of Query, Key and Value for Efficient Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhenghao Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zihao Tang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yeyun Gong</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+Y">Ying Xin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyue Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kailai Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yu Yan</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiao Liang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shuai Lu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiming Huang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zheheng Luo</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lei Qu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xuan Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaoxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yuqing Xia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+F">Feiyang Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuting Jiang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yasen Hu</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+H">Hao Ni</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Binyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guoshuai Zhao</a> , et al. (9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13629v2-abstract-short" style="display: inline;"> We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data. 
DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13629v2-abstract-full').style.display = 'inline'; document.getElementById('2501.13629v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13629v2-abstract-full" style="display: none;"> We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data. DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, based on their varying impacts on the model performance and efficiency indicators. Specifically, we (1) conduct extensive experiments that demonstrate the model's varying sensitivity to the compression of K and V components, leading to the development of differentially compressed KV, and (2) propose augmented Q to expand the Q head dimension, which enhances the model's representation capacity with minimal impacts on the inference speed. Rigorous theoretical and empirical analyses reveal that DiffQKV attention significantly enhances efficiency, achieving up to a 33.36% improvement in inference speed over the conventional grouped-query attention (GQA) in long-context scenarios. We pre-train Sigma on 6T tokens from various sources, including 19.5B system domain data that we carefully collect and 1T tokens of synthesized and rewritten data. In general domains, Sigma achieves comparable performance to other state-of-the-art models. In the system domain, we introduce the first comprehensive benchmark AIMicius, where Sigma demonstrates remarkable performance across all tasks, significantly outperforming GPT-4 with an absolute improvement of up to 52.5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13629v2-abstract-full').style.display = 'none'; document.getElementById('2501.13629v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
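<p class="is-size-7">As a back-of-the-envelope illustration of differentially compressing K and V (hypothetical numbers, not Sigma's actual configuration), the sketch below compares per-token KV-cache footprints when the K heads are compressed more aggressively than the V heads.</p>
<pre><code class="language-python">
# Illustrative sketch only: per-token KV-cache size under uniform vs. differential
# K/V settings; head counts, dimensions, and dtype width are hypothetical.
def kv_cache_bytes(n_layers, n_k_heads, k_head_dim, n_v_heads, v_head_dim, bytes_per=2):
    return n_layers * (n_k_heads * k_head_dim + n_v_heads * v_head_dim) * bytes_per

base = kv_cache_bytes(n_layers=32, n_k_heads=8, k_head_dim=128, n_v_heads=8, v_head_dim=128)
diff = kv_cache_bytes(n_layers=32, n_k_heads=4, k_head_dim=64,  n_v_heads=8, v_head_dim=128)
print(base, diff, f"{1 - diff / base:.0%} smaller")  # K compressed harder than V
</code></pre>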
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10714">arXiv:2501.10714</a> <span> [<a href="https://arxiv.org/pdf/2501.10714">pdf</a>, <a href="https://arxiv.org/format/2501.10714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3669940.3707272">10.1145/3669940.3707272 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FSMoE: A Flexible and Scalable Training System for Sparse Mixture-of-Experts Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xinglin Pan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wenxiang Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lin Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shaohuai Shi</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10714v1-abstract-short" style="display: inline;"> Recent large language models (LLMs) have tended to leverage sparsity to reduce computations, employing the sparsely activated mixture-of-experts (MoE) technique. MoE introduces four modules, including token routing, token communication, expert computation, and expert parallelism, that impact model quality and training efficiency. To enable versatile usage of MoE models, we introduce FSMoE, a flexi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10714v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10714v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10714v1-abstract-full" style="display: none;"> Recent large language models (LLMs) have tended to leverage sparsity to reduce computations, employing the sparsely activated mixture-of-experts (MoE) technique. MoE introduces four modules, including token routing, token communication, expert computation, and expert parallelism, that impact model quality and training efficiency. To enable versatile usage of MoE models, we introduce FSMoE, a flexible training system optimizing task scheduling with three novel techniques: 1) Unified abstraction and online profiling of MoE modules for task scheduling across various MoE implementations. 2) Co-scheduling intra-node and inter-node communications with computations to minimize communication overheads. 3) To support near-optimal task scheduling, we design an adaptive gradient partitioning method for gradient aggregation and a schedule to adaptively pipeline communications and computations. We conduct extensive experiments with configured MoE layers and real-world MoE models on two GPU clusters. 
Experimental results show that 1) our FSMoE supports four popular types of MoE routing functions and is more efficient than existing implementations (with up to a 1.42$\times$ speedup), and 2) FSMoE outperforms the state-of-the-art MoE training systems (DeepSpeed-MoE and Tutel) by 1.18$\times$-1.22$\times$ on 1458 MoE layers and 1.19$\times$-3.01$\times$ on real-world MoE models based on GPT-2 and Mixtral using a popular routing function. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10714v1-abstract-full').style.display = 'none'; document.getElementById('2501.10714v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09178">arXiv:2501.09178</a> <span> [<a href="https://arxiv.org/pdf/2501.09178">pdf</a>, <a href="https://arxiv.org/format/2501.09178">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Graph Representation Learning with Localized Topological Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zuoyu Yan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Ze Ye</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tengfei Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Liangcai Gao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhi Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yusu Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09178v1-abstract-short" style="display: inline;"> Representation learning on graphs is a fundamental problem that can be crucial in various tasks. Graph neural networks, the dominant approach for graph representation learning, are limited in their representation power. Therefore, it can be beneficial to explicitly extract and incorporate high-order topological and geometric information into these models. In this paper, we propose a principled app… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09178v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09178v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09178v1-abstract-full" style="display: none;"> Representation learning on graphs is a fundamental problem that can be crucial in various tasks. Graph neural networks, the dominant approach for graph representation learning, are limited in their representation power. Therefore, it can be beneficial to explicitly extract and incorporate high-order topological and geometric information into these models. 
In this paper, we propose a principled approach to extract the rich connectivity information of graphs based on the theory of persistent homology. Our method utilizes the topological features to enhance the representation learning of graph neural networks and achieve state-of-the-art performance on various node classification and link prediction benchmarks. We also explore the option of end-to-end learning of the topological features, i.e., treating topological computation as a differentiable operator during learning. Our theoretical analysis and empirical study provide insights and potential guidelines for employing topological features in graph learning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09178v1-abstract-full').style.display = 'none'; document.getElementById('2501.09178v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in JMLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08579">arXiv:2501.08579</a> <span> [<a href="https://arxiv.org/pdf/2501.08579">pdf</a>, <a href="https://arxiv.org/format/2501.08579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> What Limits LLM-based Human Simulation: LLMs or Our Design? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiaying Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bingqiao Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bingsheng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08579v1-abstract-short" style="display: inline;"> We argue that advancing LLM-based human simulation requires addressing both LLM's inherent limitations and simulation framework design challenges. Recent studies have revealed significant gaps between LLM-based human simulations and real-world observations, highlighting these dual challenges. To address these gaps, we present a comprehensive analysis of LLM limitations and our design issues, propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08579v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08579v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08579v1-abstract-full" style="display: none;"> We argue that advancing LLM-based human simulation requires addressing both LLM's inherent limitations and simulation framework design challenges. 
Recent studies have revealed significant gaps between LLM-based human simulations and real-world observations, highlighting these dual challenges. To address these gaps, we present a comprehensive analysis of LLM limitations and our design issues, proposing targeted solutions for both aspects. Furthermore, we explore future directions that address both challenges simultaneously, particularly in data collection, LLM generation, and evaluation. To support further research in this field, we provide a curated collection of LLM-based human simulation resources.\footnote{https://github.com/Persdre/llm-human-simulation} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08579v1-abstract-full').style.display = 'none'; document.getElementById('2501.08579v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05727">arXiv:2501.05727</a> <span> [<a href="https://arxiv.org/pdf/2501.05727">pdf</a>, <a href="https://arxiv.org/format/2501.05727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enabling Scalable Oversight via Self-Evolving Critic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziniu Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhenyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tian Ding</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05727v1-abstract-short" style="display: inline;"> Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. 
While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05727v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05727v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05727v1-abstract-full" style="display: none;"> Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3% improvement on critique-correction and error identification benchmarks. Our analysis reveals that SCRIT's performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05727v1-abstract-full').style.display = 'none'; document.getElementById('2501.05727v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02807">arXiv:2501.02807</a> <span> [<a href="https://arxiv.org/pdf/2501.02807">pdf</a>, <a href="https://arxiv.org/format/2501.02807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AE-NeRF: Augmenting Event-Based Neural Radiance Fields for Non-ideal Conditions and Larger Scene </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chaoran Feng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wangbo Yu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenyu Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junwu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02807v2-abstract-short" style="display: inline;"> Compared to frame-based methods, computational neuromorphic imaging using event cameras offers significant advantages, such as minimal motion blur, enhanced temporal resolution, and high dynamic range. The multi-view consistency of Neural Radiance Fields combined with the unique benefits of event cameras, has spurred recent research into reconstructing NeRF from data captured by moving event camer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02807v2-abstract-full').style.display = 'inline'; document.getElementById('2501.02807v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02807v2-abstract-full" style="display: none;"> Compared to frame-based methods, computational neuromorphic imaging using event cameras offers significant advantages, such as minimal motion blur, enhanced temporal resolution, and high dynamic range. The multi-view consistency of Neural Radiance Fields combined with the unique benefits of event cameras, has spurred recent research into reconstructing NeRF from data captured by moving event cameras. While showing impressive performance, existing methods rely on ideal conditions with the availability of uniform and high-quality event sequences and accurate camera poses, and mainly focus on the object level reconstruction, thus limiting their practical applications. In this work, we propose AE-NeRF to address the challenges of learning event-based NeRF from non-ideal conditions, including non-uniform event sequences, noisy poses, and various scales of scenes. Our method exploits the density of event streams and jointly learn a pose correction module with an event-based NeRF (e-NeRF) framework for robust 3D reconstruction from inaccurate camera poses. To generalize to larger scenes, we propose hierarchical event distillation with a proposal e-NeRF network and a vanilla e-NeRF network to resample and refine the reconstruction process. We further propose an event reconstruction loss and a temporal loss to improve the view consistency of the reconstructed scene. 
We established a comprehensive benchmark that includes large-scale scenes to simulate practical non-ideal conditions, incorporating both synthetic and challenging real-world event datasets. The experimental results show that our method achieves a new state-of-the-art in event-based 3D reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02807v2-abstract-full').style.display = 'none'; document.getElementById('2501.02807v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20451">arXiv:2412.20451</a> <span> [<a href="https://arxiv.org/pdf/2412.20451">pdf</a>, <a href="https://arxiv.org/format/2412.20451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Improving Vision-Language-Action Models via Chain-of-Affordance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinming Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhibin Tang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Junjie Wen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Minjie Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengmeng Li</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+R">Ran Cheng</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yaxin Peng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Feifei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20451v1-abstract-short" style="display: inline;"> Robot foundation models, particularly Vision-Language-Action (VLA) models, have garnered significant attention for their ability to enhance robot policy learning, greatly improving robot generalization and robustness. OpenAI recent model, o1, showcased impressive capabilities in solving complex problems by utilizing extensive reasoning chains. This prompts an important question: can robot models a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20451v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20451v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20451v1-abstract-full" style="display: none;"> Robot foundation models, particularly Vision-Language-Action (VLA) models, have garnered significant attention for their ability to enhance robot policy learning, greatly improving robot generalization and robustness. OpenAI recent model, o1, showcased impressive capabilities in solving complex problems by utilizing extensive reasoning chains. 
This prompts an important question: can robot models achieve better performance in multi-task, complex environments by reviewing prior observations and then providing task-specific reasoning to guide action prediction? In this paper, we introduce \textbf{Chain-of-Affordance (CoA)}, a novel approach to scaling robot models by incorporating reasoning in the format of sequential robot affordances to facilitate task completion. Specifically, we prompt the model to consider the following four types of affordances before taking action: a) object affordance - what object to manipulate and where it is; b) grasp affordance - the specific object part to grasp; c) spatial affordance - the optimal space to place the object; and d) movement affordance - the collision-free path for movement. By integrating this knowledge into the policy model, the robot gains essential context, allowing it to act with increased precision and robustness during inference. Our experiments demonstrate that CoA achieves superior performance than state-of-the-art robot foundation models, such as OpenVLA and Octo. Additionally, CoA shows strong generalization to unseen object poses, identifies free space, and avoids obstacles in novel environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20451v1-abstract-full').style.display = 'none'; document.getElementById('2412.20451v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project webpage is available at https://chain-of-affordance.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17729">arXiv:2412.17729</a> <span> [<a href="https://arxiv.org/pdf/2412.17729">pdf</a>, <a href="https://arxiv.org/format/2412.17729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Chumor 2.0: Towards Benchmarking Chinese Humor Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+R">Ruiqi He</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yushu He</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Longju Bai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiarui Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenjie Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zenghao Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">He Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Hanchen Xia</a>, <a href="/search/cs?searchtype=author&query=Mihalcea%2C+R">Rada Mihalcea</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+N">Naihao Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17729v1-abstract-short" 
style="display: inline;"> Existing humor datasets and evaluations predominantly focus on English, leaving limited resources for culturally nuanced humor in non-English languages like Chinese. To address this gap, we construct Chumor, the first Chinese humor explanation dataset that exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba, a Chinese Reddit-like platform known for sharing intellectually… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17729v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17729v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17729v1-abstract-full" style="display: none;"> Existing humor datasets and evaluations predominantly focus on English, leaving limited resources for culturally nuanced humor in non-English languages like Chinese. To address this gap, we construct Chumor, the first Chinese humor explanation dataset that exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba, a Chinese Reddit-like platform known for sharing intellectually challenging and culturally specific jokes. We test ten LLMs through direct and chain-of-thought prompting, revealing that Chumor poses significant challenges to existing LLMs, with their accuracy slightly above random and far below human. In addition, our analysis highlights that human-annotated humor explanations are significantly better than those generated by GPT-4o and ERNIE-4-turbo. We release Chumor at https://huggingface.co/datasets/dnaihao/Chumor, our project page is at https://dnaihao.github.io/Chumor-dataset/, our leaderboard is at https://huggingface.co/spaces/dnaihao/Chumor, and our codebase is at https://github.com/dnaihao/Chumor-dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17729v1-abstract-full').style.display = 'none'; document.getElementById('2412.17729v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2406.12754</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16839">arXiv:2412.16839</a> <span> [<a href="https://arxiv.org/pdf/2412.16839">pdf</a>, <a href="https://arxiv.org/format/2412.16839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Human-Guided Image Generation for Expanding Small-Scale Training Image Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Changjian Chen</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+F">Fei Lv</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Y">Yalong Guan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengcheng Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shengjie Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhuo Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16839v2-abstract-short" style="display: inline;"> The performance of computer vision models in certain real-world applications (e.g., rare wildlife observation) is limited by the small number of available images. Expanding datasets using pre-trained generative models is an effective way to address this limitation. However, since the automatic generation process is uncontrollable, the generated images are usually limited in diversity, and some of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16839v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16839v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16839v2-abstract-full" style="display: none;"> The performance of computer vision models in certain real-world applications (e.g., rare wildlife observation) is limited by the small number of available images. Expanding datasets using pre-trained generative models is an effective way to address this limitation. However, since the automatic generation process is uncontrollable, the generated images are usually limited in diversity, and some of them are undesired. In this paper, we propose a human-guided image generation method for more controllable dataset expansion. We develop a multi-modal projection method with theoretical guarantees to facilitate the exploration of both the original and generated images. Based on the exploration, users refine the prompts and re-generate images for better performance. Since directly refining the prompts is challenging for novice users, we develop a sample-level prompt refinement method to make it easier. With this method, users only need to provide sample-level feedback (e.g., which samples are undesired) to obtain better prompts. 
The effectiveness of our method is demonstrated through the quantitative evaluation of the multi-modal projection method, improved model performance in the case study for both classification and object detection tasks, and positive feedback from the experts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16839v2-abstract-full').style.display = 'none'; document.getElementById('2412.16839v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TVCG2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16537">arXiv:2412.16537</a> <span> [<a href="https://arxiv.org/pdf/2412.16537">pdf</a>, <a href="https://arxiv.org/format/2412.16537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Private Large Transformers Inference through Fine-grained Collaborative Computation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuntian Chen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhanyong Tang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+T">Tianpei Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bingsheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhiying Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16537v2-abstract-short" style="display: inline;"> Homomorphic encryption (HE) and secret sharing (SS) enable computations on encrypted data, providing significant privacy benefits for large transformer-based models (TBM) in sensitive sectors like medicine and finance. However, private TBM inference incurs significant costs due to the coarse-grained application of HE and SS. We present FASTLMPI, a new approach to accelerate private TBM inference t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16537v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16537v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16537v2-abstract-full" style="display: none;"> Homomorphic encryption (HE) and secret sharing (SS) enable computations on encrypted data, providing significant privacy benefits for large transformer-based models (TBM) in sensitive sectors like medicine and finance. However, private TBM inference incurs significant costs due to the coarse-grained application of HE and SS. We present FASTLMPI, a new approach to accelerate private TBM inference through fine-grained computation optimization. 
Specifically, through the fine-grained co-design of homomorphic encryption and secret sharing, FASTLMPI achieves efficient protocols for matrix multiplication, SoftMax, LayerNorm, and GeLU. In addition, FASTLMPI introduces a precise segmented approximation technique for differentiable non-linear functions, improving its fitting accuracy while maintaining a low polynomial degree. Compared to the solution BOLT (S&P'24), FASTLMPI shows a remarkable 54% to 64% decrease in runtime and an impressive 72.2% reduction in communication costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16537v2-abstract-full').style.display = 'none'; document.getElementById('2412.16537v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 Pages (with 4 Pages appendix; 14 Figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16243">arXiv:2412.16243</a> <span> [<a href="https://arxiv.org/pdf/2412.16243">pdf</a>, <a href="https://arxiv.org/format/2412.16243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bag of Tricks for Multimodal AutoML with Image, Text, and Tabular Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhiqiang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zihan Zhong</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Friedland%2C+G">Gerald Friedland</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16243v1-abstract-short" style="display: inline;"> This paper studies the best practices for automatic machine learning (AutoML). While previous AutoML efforts have predominantly focused on unimodal data, the multimodal aspect remains under-explored. Our study delves into classification and regression problems involving flexible combinations of image, text, and tabular data. We curate a benchmark comprising 22 multimodal datasets from diverse real… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16243v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16243v1-abstract-full" style="display: none;"> This paper studies the best practices for automatic machine learning (AutoML). While previous AutoML efforts have predominantly focused on unimodal data, the multimodal aspect remains under-explored. Our study delves into classification and regression problems involving flexible combinations of image, text, and tabular data.
We curate a benchmark comprising 22 multimodal datasets from diverse real-world applications, encompassing all 4 combinations of the 3 modalities. Across this benchmark, we scrutinize design choices related to multimodal fusion strategies, multimodal data augmentation, converting tabular data into text, cross-modal alignment, and handling missing modalities. Through extensive experimentation and analysis, we distill a collection of effective strategies and consolidate them into a unified pipeline, achieving robust performance on diverse datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16243v1-abstract-full').style.display = 'none'; document.getElementById('2412.16243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15501">arXiv:2412.15501</a> <span> [<a href="https://arxiv.org/pdf/2412.15501">pdf</a>, <a href="https://arxiv.org/format/2412.15501">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Humanlike Cognitive Patterns as Emergent Phenomena in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhisheng Tang</a>, <a href="/search/cs?searchtype=author&query=Kejriwal%2C+M">Mayank Kejriwal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15501v1-abstract-short" style="display: inline;"> Research on emergent patterns in Large Language Models (LLMs) has gained significant traction in both psychology and artificial intelligence, motivating the need for a comprehensive review that offers a synthesis of this complex landscape. In this article, we systematically review LLMs' capabilities across three important cognitive domains: decision-making biases, reasoning, and creativity. We use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15501v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15501v1-abstract-full" style="display: none;"> Research on emergent patterns in Large Language Models (LLMs) has gained significant traction in both psychology and artificial intelligence, motivating the need for a comprehensive review that offers a synthesis of this complex landscape. In this article, we systematically review LLMs' capabilities across three important cognitive domains: decision-making biases, reasoning, and creativity. We use empirical studies drawing on established psychological tests and compare LLMs' performance to human benchmarks. 
On decision-making, our synthesis reveals that while LLMs demonstrate several human-like biases, some biases observed in humans are absent, indicating cognitive patterns that only partially align with human decision-making. On reasoning, advanced LLMs like GPT-4 exhibit deliberative reasoning akin to human System-2 thinking, while smaller models fall short of human-level performance. A distinct dichotomy emerges in creativity: while LLMs excel in language-based creative tasks, such as storytelling, they struggle with divergent thinking tasks that require real-world context. Nonetheless, studies suggest that LLMs hold considerable potential as collaborators, augmenting creativity in human-machine problem-solving settings. Discussing key limitations, we also offer guidance for future research in areas such as memory, attention, and open-source model development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15501v1-abstract-full').style.display = 'none'; document.getElementById('2412.15501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15321">arXiv:2412.15321</a> <span> [<a href="https://arxiv.org/pdf/2412.15321">pdf</a>, <a href="https://arxiv.org/format/2412.15321">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Next Patch Prediction for Autoregressive Visual Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yatian Pang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peng Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bin Zhu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenyu Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Tay%2C+F+E+H">Francis E. H. Tay</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Ser-Nam Lim</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Harry Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15321v2-abstract-short" style="display: inline;"> Autoregressive models, built based on the Next Token Prediction (NTP) paradigm, show great potential in developing a unified framework that integrates both language and vision tasks. In this work, we rethink the NTP for autoregressive image generation and propose a novel Next Patch Prediction (NPP) paradigm. 
Our key idea is to group and aggregate image tokens into patch tokens containing high info… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15321v2-abstract-full').style.display = 'inline'; document.getElementById('2412.15321v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15321v2-abstract-full" style="display: none;"> Autoregressive models, built based on the Next Token Prediction (NTP) paradigm, show great potential in developing a unified framework that integrates both language and vision tasks. In this work, we rethink the NTP for autoregressive image generation and propose a novel Next Patch Prediction (NPP) paradigm. Our key idea is to group and aggregate image tokens into patch tokens containing high information density. With patch tokens as a shorter input sequence, the autoregressive model is trained to predict the next patch, thereby significantly reducing the computational cost. We further propose a multi-scale coarse-to-fine patch grouping strategy that exploits the natural hierarchical property of image data. Experiments on a diverse range of models (100M-1.4B parameters) demonstrate that the next patch prediction paradigm could reduce the training cost to around 0.6 times while improving image generation quality by up to 1.0 FID score on the ImageNet benchmark. We highlight that our method retains the original autoregressive model architecture without introducing additional trainable parameters or specifically designing a custom image tokenizer, thus ensuring flexibility and seamless adaptation to various autoregressive models for visual generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15321v2-abstract-full').style.display = 'none'; document.getElementById('2412.15321v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/PKU-YuanGroup/Next-Patch-Prediction, v2: add related work "Patch-Level Training for Large Language Models"</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14304">arXiv:2412.14304</a> <span> [<a href="https://arxiv.org/pdf/2412.14304">pdf</a>, <a href="https://arxiv.org/format/2412.14304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-OphthaLingua: A Multilingual Benchmark for Assessing and Debiasing LLM Ophthalmological QA in LMICs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Restrepo%2C+D">David Restrepo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chenwei Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhengxu Tang</a>, <a href="/search/cs?searchtype=author&query=Shuai%2C+Z">Zitao Shuai</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+T+N+M">Thao Nguyen Minh Phan</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jun-En Ding</a>, <a href="/search/cs?searchtype=author&query=Dao%2C+C">Cong-Tinh Dao</a>, <a href="/search/cs?searchtype=author&query=Gallifant%2C+J">Jack Gallifant</a>, <a href="/search/cs?searchtype=author&query=Dychiao%2C+R+G">Robyn Gayle Dychiao</a>, <a href="/search/cs?searchtype=author&query=Artiaga%2C+J+C">Jose Carlo Artiaga</a>, <a href="/search/cs?searchtype=author&query=Bando%2C+A+H">Andr茅 Hiroshi Bando</a>, <a href="/search/cs?searchtype=author&query=Gracitelli%2C+C+P+B">Carolina Pelegrini Barbosa Gracitelli</a>, <a href="/search/cs?searchtype=author&query=Ferrer%2C+V">Vincenz Ferrer</a>, <a href="/search/cs?searchtype=author&query=Celi%2C+L+A">Leo Anthony Celi</a>, <a href="/search/cs?searchtype=author&query=Bitterman%2C+D">Danielle Bitterman</a>, <a href="/search/cs?searchtype=author&query=Morley%2C+M+G">Michael G Morley</a>, <a href="/search/cs?searchtype=author&query=Nakayama%2C+L+F">Luis Filipe Nakayama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14304v1-abstract-short" style="display: inline;"> Current ophthalmology clinical workflows are plagued by over-referrals, long waits, and complex and heterogeneous medical records. Large language models (LLMs) present a promising solution to automate various procedures such as triaging, preliminary tests like visual acuity assessment, and report summaries. However, LLMs have demonstrated significantly varied performance across different languages… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14304v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14304v1-abstract-full" style="display: none;"> Current ophthalmology clinical workflows are plagued by over-referrals, long waits, and complex and heterogeneous medical records. 
Large language models (LLMs) present a promising solution to automate various procedures such as triaging, preliminary tests like visual acuity assessment, and report summaries. However, LLMs have demonstrated significantly varied performance across different languages in natural language question-answering tasks, potentially exacerbating healthcare disparities in Low and Middle-Income Countries (LMICs). This study introduces the first multilingual ophthalmological question-answering benchmark with manually curated questions parallel across languages, allowing for direct cross-lingual comparisons. Our evaluation of 6 popular LLMs across 7 different languages reveals substantial bias across different languages, highlighting risks for clinical deployment of LLMs in LMICs. Existing debiasing methods such as Translation Chain-of-Thought or Retrieval-augmented generation (RAG) by themselves fall short of closing this performance gap, often failing to improve performance across all languages and lacking specificity for the medical domain. To address this issue, we propose CLARA (Cross-Lingual Reflective Agentic system), a novel inference-time de-biasing method leveraging retrieval augmented generation and self-verification. Our approach not only improves performance across all languages but also significantly reduces the multilingual bias gap, facilitating equitable LLM application across the globe. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14304v1-abstract-full').style.display = 'none'; document.getElementById('2412.14304v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the AAAI 2025 Artificial Intelligence for Social Impact Track (AAAI-AISI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12310">arXiv:2412.12310</a> <span> [<a href="https://arxiv.org/pdf/2412.12310">pdf</a>, <a href="https://arxiv.org/format/2412.12310">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Second Language (Arabic) Acquisition of LLMs via Progressive Vocabulary Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianqing Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huang Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhihang Lin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Juhao Liang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&query=Almubarak%2C+K">Khalid Almubarak</a>, <a href="/search/cs?searchtype=author&query=Alharthik%2C+A">Abdulmohsen Alharthik</a>, <a href="/search/cs?searchtype=author&query=An%2C+B">Bang An</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Juncai He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiangbo Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fei Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junying Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhuoheng Ma</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuhao Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Alghamdi%2C+E+A">Emad A. Alghamdi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lian Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinchao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12310v1-abstract-short" style="display: inline;"> This paper addresses the critical need for democratizing large language models (LLM) in the Arab world, a region that has seen slower progress in developing models comparable to state-of-the-art offerings like GPT-4 or ChatGPT 3.5, due to a predominant focus on mainstream languages (e.g., English and Chinese). 
One practical objective for an Arabic LLM is to utilize an Arabic-specific vocabulary fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12310v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12310v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12310v1-abstract-full" style="display: none;"> This paper addresses the critical need for democratizing large language models (LLM) in the Arab world, a region that has seen slower progress in developing models comparable to state-of-the-art offerings like GPT-4 or ChatGPT 3.5, due to a predominant focus on mainstream languages (e.g., English and Chinese). One practical objective for an Arabic LLM is to utilize an Arabic-specific vocabulary for the tokenizer that could speed up decoding. However, using a different vocabulary often leads to a degradation of learned knowledge since many words are initially out-of-vocabulary (OOV) when training starts. Inspired by the vocabulary learning during Second Language (Arabic) Acquisition for humans, the released AraLLaMA employs progressive vocabulary expansion, which is implemented by a modified BPE algorithm that progressively extends the Arabic subwords in its dynamic vocabulary during training, thereby balancing the OOV ratio at every stage. The ablation study demonstrated the effectiveness of Progressive Vocabulary Expansion. Moreover, AraLLaMA achieves decent performance comparable to the best Arabic LLMs across a variety of Arabic benchmarks. Models, training data, benchmarks, and codes will be all open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12310v1-abstract-full').style.display = 'none'; document.getElementById('2412.12310v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11599">arXiv:2412.11599</a> <span> [<a href="https://arxiv.org/pdf/2412.11599">pdf</a>, <a href="https://arxiv.org/format/2412.11599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D$^2$-Actor: Learning Pose-Conditioned 3D-Aware Denoiser for Realistic Gaussian Avatar Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zichen Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongyu Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaxin Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11599v1-abstract-short" style="display: inline;"> Advancements in neural implicit representations and differentiable rendering have markedly improved the ability to learn animatable 3D avatars from sparse multi-view RGB videos. However, current methods that map observation space to canonical space often face challenges in capturing pose-dependent details and generalizing to novel poses. While diffusion models have demonstrated remarkable zero-sho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11599v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11599v1-abstract-full" style="display: none;"> Advancements in neural implicit representations and differentiable rendering have markedly improved the ability to learn animatable 3D avatars from sparse multi-view RGB videos. However, current methods that map observation space to canonical space often face challenges in capturing pose-dependent details and generalizing to novel poses. While diffusion models have demonstrated remarkable zero-shot capabilities in 2D image generation, their potential for creating animatable 3D avatars from 2D inputs remains underexplored. In this work, we introduce 3D$^2$-Actor, a novel approach featuring a pose-conditioned 3D-aware human modeling pipeline that integrates iterative 2D denoising and 3D rectifying steps. The 2D denoiser, guided by pose cues, generates detailed multi-view images that provide the rich feature set necessary for high-fidelity 3D reconstruction and pose rendering. Complementing this, our Gaussian-based 3D rectifier renders images with enhanced 3D consistency through a two-stage projection strategy and a novel local coordinate representation. Additionally, we propose an innovative sampling strategy to ensure smooth temporal continuity across frames in video synthesis. Our method effectively addresses the limitations of traditional numerical solutions in handling ill-posed mappings, producing realistic and animatable 3D human avatars. Experimental results demonstrate that 3D$^2$-Actor excels in high-fidelity avatar modeling and robustly generalizes to novel poses. 
Code is available at: https://github.com/silence-tang/GaussianActor.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025

arXiv:2412.06974 (https://arxiv.org/abs/2412.06974) [pdf, other] cs.CV, cs.AI
MV-DUSt3R+: Single-Stage Scene Reconstruction from Sparse Views In 2 Seconds
Authors: Zhenggang Tang, Yuchen Fan, Dilin Wang, Hongyu Xu, Rakesh Ranjan, Alexander Schwing, Zhicheng Yan
Abstract: Recent sparse multi-view scene reconstruction advances like DUSt3R and MASt3R no longer require camera calibration and camera pose estimation. However, they only process a pair of views at a time to infer pixel-aligned pointmaps. When dealing with more than two views, a combinatorial number of error-prone pairwise reconstructions are usually followed by an expensive global optimization, which often fails to rectify the pairwise reconstruction errors. To handle more views, reduce errors, and improve inference time, we propose the fast single-stage feed-forward network MV-DUSt3R.
At its core are multi-view decoder blocks which exchange information across any number of views while considering one reference view. To make our method robust to reference view selection, we further propose MV-DUSt3R+, which employs cross-reference-view blocks to fuse information across different reference view choices. To further enable novel view synthesis, we extend both by adding and jointly training Gaussian splatting heads. Experiments on multi-view stereo reconstruction, multi-view pose estimation, and novel view synthesis confirm that our methods improve significantly upon prior art. Code will be released.
Submitted 9 December, 2024; originally announced December 2024.
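As a rough picture of what a decoder block that exchanges information across views might look like, the generic PyTorch sketch below attends jointly over the tokens of all views. The shapes, dimensions, and module layout are assumptions for illustration, not the MV-DUSt3R+ architecture.

```python
import torch
import torch.nn as nn

class CrossViewBlock(nn.Module):
    """Generic transformer block that lets tokens from all views attend to
    each other (illustrative sketch, not the MV-DUSt3R+ decoder)."""
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(),
                                 nn.Linear(4 * dim, dim))

    def forward(self, x):
        # x: (batch, views, tokens, dim); flatten views so attention is cross-view
        b, v, n, d = x.shape
        h = x.reshape(b, v * n, d)
        a = self.norm1(h)
        h = h + self.attn(a, a, a, need_weights=False)[0]
        h = h + self.mlp(self.norm2(h))
        return h.reshape(b, v, n, d)

out = CrossViewBlock()(torch.randn(2, 4, 64, 256))  # 2 scenes, 4 views, 64 tokens each
```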
arXiv:2412.06184 (https://arxiv.org/abs/2412.06184) [pdf, other] cs.CV
Evaluating Model Perception of Color Illusions in Photorealistic Scenes
Authors: Lingjun Mao, Zineng Tang, Alane Suhr
Abstract: We study the perception of color illusions by vision-language models. Color illusion, where a person's visual system perceives color differently from actual color, is well-studied in human vision. However, it remains underexplored whether vision-language models (VLMs), trained on large-scale human data, exhibit similar perceptual biases when confronted with such color illusions. We propose an automated framework for generating color illusion images, resulting in RCID (Realistic Color Illusion Dataset), a dataset of 19,000 realistic illusion images. Our experiments show that all studied VLMs exhibit perceptual biases similar to human vision. Finally, we train a model to distinguish between human perception and actual pixel differences.
Submitted 8 December, 2024; originally announced December 2024.

arXiv:2412.03876 (https://arxiv.org/abs/2412.03876) [pdf, other] cs.CV
Safeguarding Text-to-Image Generation via Inference-Time Prompt-Noise Optimization
Authors: Jiangweizhi Peng, Zhiwei Tang, Gaowen Liu, Charles Fleming, Mingyi Hong
Abstract: Text-to-Image (T2I) diffusion models are widely recognized for their ability to generate high-quality and diverse images based on text prompts. However, despite recent advances, these models are still prone to generating unsafe images containing sensitive or inappropriate content, which can be harmful to users. Current efforts to prevent inappropriate image generation by diffusion models are easy to bypass and vulnerable to adversarial attacks. How to ensure that T2I models align with specific safety goals remains a significant challenge. In this work, we propose a novel, training-free approach, called Prompt-Noise Optimization (PNO), to mitigate unsafe image generation. Our method introduces a novel optimization framework that leverages both the continuous prompt embedding and the injected noise trajectory in the sampling process to generate safe images.
Extensive numerical results demonstrate that our framework achieves state-of-the-art performance in suppressing toxic image generations and demonstrates robustness to adversarial attacks, without needing to tune the model parameters. Furthermore, compared with existing methods, PNO uses comparable generation time while offering the best tradeoff between the conflicting goals of safe generation and prompt-image alignment.
Submitted 5 December, 2024; originally announced December 2024.
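The inference-time optimization pattern described in the abstract, jointly adjusting the continuous prompt embedding and the injected noise, can be pictured generically as below. Here `generate` and `safety_penalty` are hypothetical differentiable stand-ins, and the loop is a simplified illustration rather than the authors' PNO procedure.

```python
import torch

def optimize_prompt_and_noise(prompt_emb, noise, generate, safety_penalty,
                              steps=20, lr=0.05, keep_close=0.1):
    """Generic sketch: gradient steps on the prompt embedding and initial noise
    to lower a differentiable safety penalty while staying near the original
    prompt embedding (so prompt-image alignment is not thrown away)."""
    prompt_emb = prompt_emb.clone().requires_grad_(True)
    noise = noise.clone().requires_grad_(True)
    anchor = prompt_emb.detach().clone()
    opt = torch.optim.Adam([prompt_emb, noise], lr=lr)
    for _ in range(steps):
        image = generate(prompt_emb, noise)   # hypothetical differentiable sampler
        loss = safety_penalty(image) + keep_close * (prompt_emb - anchor).pow(2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()
    return prompt_emb.detach(), noise.detach()
```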
arXiv:2412.03293 (https://arxiv.org/abs/2412.03293) [pdf, other] cs.RO, cs.CV
Diffusion-VLA: Scaling Robot Foundation Models via Unified Diffusion and Autoregression
Authors: Junjie Wen, Minjie Zhu, Yichen Zhu, Zhibin Tang, Jinming Li, Zhongyi Zhou, Chengmeng Li, Xiaoyu Liu, Yaxin Peng, Chaomin Shen, Feifei Feng
Abstract: In this paper, we present DiffusionVLA, a novel framework that seamlessly combines the autoregression model with the diffusion model for learning visuomotor policy. Central to our approach is a next-token prediction objective, enabling the model to reason effectively over the user's query in the context of current observations. Subsequently, a diffusion model is attached to generate robust action outputs. To enhance policy learning through self-reasoning, we introduce a novel reasoning injection module that integrates reasoning phrases directly into the policy learning process. The whole framework is simple and flexible, making it easy to deploy and upgrade. We conduct extensive experiments using multiple real robots to validate the effectiveness of DiffusionVLA. Our tests include a challenging factory sorting task, where DiffusionVLA successfully categorizes objects, including those not seen during training. We observe that the reasoning module makes the model interpretable: it allows observers to understand the model's thought process and identify potential causes of policy failures. Additionally, we test DiffusionVLA on a zero-shot bin-picking task, achieving 63.7% accuracy on 102 previously unseen objects. Our method demonstrates robustness to visual changes, such as distractors and new backgrounds, and easily adapts to new embodiments. Furthermore, DiffusionVLA is data-efficient and fast at inference; our smallest DiffusionVLA-2B runs at 82 Hz on a single A6000 GPU and can be trained from scratch on fewer than 50 demonstrations for a complex task. Finally, we scale the model from 2B to 72B parameters, showcasing improved generalization capabilities with increased model size.
Submitted 4 December, 2024; originally announced December 2024.
Comments: The project page is available at: http://diffusion-vla.github.io
arXiv:2412.01615 (https://arxiv.org/abs/2412.01615) [pdf, other] cs.CV
OmniGuard: Hybrid Manipulation Localization via Augmented Versatile Deep Image Watermarking
Authors: Xuanyu Zhang, Zecheng Tang, Zhipei Xu, Runyi Li, Youmin Xu, Bin Chen, Feng Gao, Jian Zhang
Abstract: With the rapid growth of generative AI and its widespread application in image editing, new risks have emerged regarding the authenticity and integrity of digital content. Existing versatile watermarking approaches suffer from trade-offs between tamper localization precision and visual quality. Constrained by the limited flexibility of previous frameworks, their localized watermark must remain fixed across all images. Under AIGC editing, their copyright extraction accuracy is also unsatisfactory. To address these challenges, we propose OmniGuard, a novel augmented versatile watermarking approach that integrates proactive embedding with passive, blind extraction for robust copyright protection and tamper localization. OmniGuard employs a hybrid forensic framework that enables flexible localization watermark selection and introduces a degradation-aware tamper extraction network for precise localization under challenging conditions. Additionally, a lightweight AIGC-editing simulation layer is designed to enhance robustness across global and local editing.
Extensive experiments show that OmniGuard achieves superior fidelity, robustness, and flexibility. Compared to the recent state-of-the-art approach EditGuard, our method outperforms it by 4.25 dB in PSNR of the container image, 20.7% in F1-score under noisy conditions, and 14.8% in average bit accuracy.
Submitted 2 December, 2024; originally announced December 2024.
Comments: Technical Report

arXiv:2412.00692 (https://arxiv.org/abs/2412.00692) [pdf, other] cs.CV
BEV-SUSHI: Multi-Target Multi-Camera 3D Detection and Tracking in Bird's-Eye View
Authors: Yizhou Wang, Tim Meinhardt, Orcun Cetintas, Cheng-Yen Yang, Sameer Satish Pusegaonkar, Benjamin Missaoui, Sujit Biswas, Zheng Tang, Laura Leal-Taixé
Abstract: Object perception from multi-view cameras is crucial for intelligent systems, particularly in indoor environments, e.g., warehouses, retail stores, and hospitals.
Most traditional multi-target multi-camera (MTMC) detection and tracking methods rely on 2D object detection, single-view multi-object tracking (MOT), and cross-view re-identification (ReID) techniques, without properly handling important 3D information by multi-view image aggregation. In this paper, we propose a 3D object detection and tracking framework, named BEV-SUSHI, which first aggregates multi-view images with necessary camera calibration parameters to obtain 3D object detections in bird's-eye view (BEV). Then, we introduce hierarchical graph neural networks (GNNs) to track these 3D detections in BEV for MTMC tracking results. Unlike existing methods, BEV-SUSHI has impressive generalizability across different scenes and diverse camera settings, with exceptional capability for long-term association handling. As a result, our proposed BEV-SUSHI establishes the new state of the art on the AICity'24 dataset with 81.22 HOTA, and 95.6 IDF1 on the WildTrack dataset.
Submitted 7 December, 2024; v1 submitted 1 December, 2024; originally announced December 2024.

arXiv:2412.00653 (https://arxiv.org/abs/2412.00653) [pdf, other] cs.LG, cs.AI, stat.ML
Predictive Inference With Fast Feature Conformal Prediction
Authors: Zihao Tang, Boyuan Wang, Chuan Wen, Jiaye Teng
Abstract: Conformal prediction is widely adopted in uncertainty quantification, due to its post-hoc, distribution-free, and model-agnostic properties. In the realm of modern deep learning, researchers have proposed Feature Conformal Prediction (FCP), which deploys conformal prediction in a feature space, yielding reduced band lengths.
However, the practical utility of FCP is limited due to the time-consuming non-linear operations required to transform confidence bands from feature space to output space. In this paper, we introduce Fast Feature Conformal Prediction (FFCP), which features a novel non-conformity score and is convenient for practical applications. FFCP serves as a fast version of FCP, in that it equivalently employs a Taylor expansion to approximate the aforementioned non-linear operations in FCP. Empirical validations showcase that FFCP performs comparably with FCP (both outperforming the vanilla version) while achieving a significant reduction in computational time by approximately 50x. The code is available at https://github.com/ElvisWang1111/FastFeatureCP.
Submitted 30 November, 2024; originally announced December 2024.
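The Taylor-expansion trick can be pictured with a short sketch: rather than inverting the network head to map a feature-space band back to the output space, a first-order approximation normalizes the output residual by the gradient of the head with respect to the feature. The snippet below is a generic illustration of that idea with hypothetical names; the authors' exact score is in the linked repository.

```python
import torch

def gradient_normalized_score(g_head, feature, y_true, eps=1e-8):
    """Sketch of a first-order (Taylor-style) non-conformity score:
    |y - g(v)| / ||d g(v) / d v||, computed per example."""
    v = feature.detach().clone().requires_grad_(True)
    y_hat = g_head(v)                              # scalar prediction per example
    grad, = torch.autograd.grad(y_hat.sum(), v)    # d y_hat / d v, same shape as v
    return (y_true - y_hat).abs() / (grad.flatten(1).norm(dim=1) + eps)

# Toy usage: a linear head over a batch of 4 sixteen-dimensional features.
head = torch.nn.Linear(16, 1)
scores = gradient_normalized_score(lambda v: head(v).squeeze(-1),
                                   torch.randn(4, 16), torch.randn(4))
```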
arXiv:2412.00131 (https://arxiv.org/abs/2412.00131) [pdf, other] cs.CV, cs.AI
Open-Sora Plan: Open-Source Large Video Generation Model
Authors: Bin Lin, Yunyang Ge, Xinhua Cheng, Zongjian Li, Bin Zhu, Shaodong Wang, Xianyi He, Yang Ye, Shenghai Yuan, Liuhan Chen, Tanghui Jia, Junwu Zhang, Zhenyu Tang, Yatian Pang, Bin She, Cen Yan, Zhiheng Hu, Xiaoyi Dong, Lin Chen, Zhang Pan, Xing Zhou, Shaoling Dong, Yonghong Tian, Li Yuan
Abstract: We introduce Open-Sora Plan, an open-source project that aims to contribute a large generation model for generating desired high-resolution videos with long durations based on various user inputs.
Our project comprises multiple components for the entire video generation process, including a Wavelet-Flow Variational Autoencoder, a Joint Image-Video Skiparse Denoiser, and various condition controllers. Moreover, many assistant strategies for efficient training and inference are designed, and a multi-dimensional data curation pipeline is proposed for obtaining desired high-quality data. Benefiting from these efficient designs, our Open-Sora Plan achieves impressive video generation results in both qualitative and quantitative evaluations. We hope our careful design and practical experience can inspire the video generation research community. All our code and model weights are publicly available at https://github.com/PKU-YuanGroup/Open-Sora-Plan.
Submitted 28 November, 2024; originally announced December 2024.
Comments: v1.3

arXiv:2411.15583 (https://arxiv.org/abs/2411.15583) [pdf, other] cs.HC
Exploring Viewing Modalities in Cinematic Virtual Reality: A Systematic Review and Meta-Analysis of Challenges in Evaluating User Experience
Authors: Yawen Zhang, Han Zhou, Zhoumingju Jiang, Zilu Tang, Tao Luo, Qinyuan Lei
Abstract: Cinematic Virtual Reality (CVR) is a narrative-driven VR experience that uses head-mounted displays with a 360-degree field of view. Previous research has explored different viewing modalities to enhance viewers' CVR experience. This study conducted a systematic review and meta-analysis focusing on how different viewing modalities, including intervened rotation, avatar assistance, guidance cues, and perspective shifting, influence the CVR experience. The study screened 3444 papers (published between 01/01/2013 and 17/06/2023) and selected 45 for systematic review, 13 of which were also included in the meta-analysis. We conducted separate random-effects meta-analyses and applied Robust Variance Estimation to examine CVR viewing modalities and user experience outcomes. Evidence from experiments was synthesized as differences between standardized mean differences (SMDs) of the user experience of the control group ("Swivel-Chair" CVR) and the experiment groups. To our surprise, we found inconsistencies in the effect sizes across different studies, even with the same viewing modalities. Moreover, in these studies, terms such as "presence," "immersion," and "narrative engagement" were often used interchangeably.
Their irregular use of questionnaires, overreliance on self-developed questionnaires, and incomplete data reporting may have led to unrigorous evaluations of CVR experiences. This study contributes to Human-Computer Interaction (HCI) research by identifying gaps in CVR research and emphasizing the need for standardized terminologies and methodologies to enhance the reliability and comparability of future CVR research.
Submitted 23 November, 2024; originally announced November 2024.
Comments: 29 pages, recommended for acceptance by CSCW
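For readers who want to see what pooling standardized mean differences with random-effects weights involves, the sketch below implements the common DerSimonian-Laird estimator on made-up effect sizes. It is a generic illustration, not the authors' analysis, which additionally applied Robust Variance Estimation.

```python
import numpy as np

def random_effects_pool(smd, var):
    """Pool SMDs with DerSimonian-Laird random-effects weights.
    Returns the pooled effect and its standard error."""
    smd, var = np.asarray(smd, float), np.asarray(var, float)
    w = 1.0 / var                                    # fixed-effect weights
    fixed = np.sum(w * smd) / np.sum(w)
    q = np.sum(w * (smd - fixed) ** 2)               # Cochran's Q
    c = np.sum(w) - np.sum(w ** 2) / np.sum(w)
    tau2 = max(0.0, (q - (len(smd) - 1)) / c)        # between-study variance
    w_star = 1.0 / (var + tau2)                      # random-effects weights
    pooled = np.sum(w_star * smd) / np.sum(w_star)
    return pooled, np.sqrt(1.0 / np.sum(w_star))

pooled, se = random_effects_pool([0.45, 0.10, 0.62], [0.04, 0.09, 0.06])  # toy data
```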
arXiv:2411.08196 (https://arxiv.org/abs/2411.08196) [pdf, other] cs.CV
Latent Space Disentanglement in Diffusion Transformers Enables Precise Zero-shot Semantic Editing
Authors: Zitao Shuai, Chenwei Wu, Zhengxu Tang, Bowen Song, Liyue Shen
Abstract: Diffusion Transformers (DiTs) have recently achieved remarkable success in text-guided image generation. In image editing, DiTs project text and image inputs to a joint latent space, from which they decode and synthesize new images. However, it remains largely unexplored how multimodal information collectively forms this joint space and how it guides the semantics of the synthesized images. In this paper, we investigate the latent space of DiT models and uncover two key properties. First, DiT's latent space is inherently semantically disentangled, where different semantic attributes can be controlled by specific editing directions. Second, consistent semantic editing requires utilizing the entire joint latent space, as neither the encoded image nor the text alone contains enough semantic information. We show that these editing directions can be obtained directly from text prompts, enabling precise semantic control without additional training or mask annotations. Based on these insights, we propose a simple yet effective Encode-Identify-Manipulate (EIM) framework for zero-shot fine-grained image editing. Specifically, we first encode both the given source image and the text prompt that describes the image to obtain the joint latent embedding. Then, using our proposed Hessian Score Distillation Sampling (HSDS) method, we identify editing directions that control specific target attributes while preserving other image features. These directions are guided by text prompts and used to manipulate the latent embeddings. Moreover, we propose a new metric to quantify the disentanglement degree of the latent space of diffusion models. Extensive experimental results on our newly curated benchmark dataset and analysis demonstrate DiT's disentanglement properties and the effectiveness of the EIM framework.
Submitted 12 November, 2024; originally announced November 2024.
Comments: arXiv admin note: substantial text overlap with arXiv:2408.13335
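The Encode-Identify-Manipulate pattern itself can be pictured with a much simpler stand-in that swaps the paper's HSDS step for a plain normalized difference of text embeddings; `encode_image` and `encode_text` are hypothetical encoders, and this is only a schematic of latent-direction editing, not the EIM method.

```python
import torch

def edit_joint_latent(encode_image, encode_text, image,
                      source_prompt, target_prompt, strength=1.5):
    """Encode: form a joint latent from the image and its description.
    Identify: take a simplified editing direction from text embeddings.
    Manipulate: move the latent along that direction."""
    z = encode_image(image) + encode_text(source_prompt)          # joint latent (sketch)
    direction = encode_text(target_prompt) - encode_text(source_prompt)
    direction = direction / (direction.norm() + 1e-8)
    return z + strength * direction
```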
arXiv:2411.05027 (https://arxiv.org/abs/2411.05027) [pdf, other] cs.CV, cs.AI, eess.IV; doi: 10.1109/MGRS.2024.3483459 (https://doi.org/10.1109/MGRS.2024.3483459)
Generative Artificial Intelligence Meets Synthetic Aperture Radar: A Survey
Authors: Zhongling Huang, Xidan Zhang, Zuqian Tang, Feng Xu, Mihai Datcu, Junwei Han
Abstract: SAR images possess unique attributes that present challenges for both human observers and vision AI models to interpret, owing to their electromagnetic characteristics. The interpretation of SAR images encounters various hurdles, with one of the primary obstacles being the data itself, which includes issues related to both the quantity and quality of the data. These challenges can be addressed using generative AI technologies. Generative AI, often known as GenAI, is an advanced and powerful branch of artificial intelligence that has gained significant attention.
The advancement has created possibilities for the creation of text, photorealistic images, videos, and material in various modalities. This paper aims to comprehensively investigate the intersection of GenAI and SAR. First, we illustrate the common data-generation-based applications in the SAR field and compare them with computer vision tasks, analyzing their similarities, differences, and general challenges. Then, an overview of the latest GenAI models is systematically reviewed, including various basic models and their variations targeting the general challenges. Additionally, the corresponding applications in the SAR domain are also included. Specifically, we propose to summarize the physical-model-based simulation approaches for SAR, and analyze the hybrid modeling methods that combine GenAI and interpretable models. The evaluation methods that have been or could be applied to SAR are also explored. Finally, the potential challenges and future prospects are discussed. To the best of our knowledge, this survey is the first exhaustive examination of the intersection of SAR and GenAI, encompassing a wide range of topics, including deep neural networks, physical models, computer vision, and SAR images. The resources of this survey are open-source at https://github.com/XAI4SAR/GenAIxSAR.
Submitted 4 November, 2024; originally announced November 2024.