CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 829 results for author: <span class="mathjax">Zhu, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Zhu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhu%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07615">arXiv:2502.07615</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07615">pdf</a>, <a href="https://arxiv.org/format/2502.07615">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Flow Distillation Sampling: Regularizing 3D Gaussians with Pre-trained Matching Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lin-Zhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kangjie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Youtian Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siyu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhihao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yao Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07615v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has achieved excellent rendering quality with fast training and rendering speed. However, its optimization process lacks explicit geometric constraints, leading to suboptimal geometric reconstruction in regions with sparse or no observational input views. In this work, we try to mitigate the issue by incorporating a pre-trained matching prior to the 3DGS optimization p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07615v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07615v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07615v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has achieved excellent rendering quality with fast training and rendering speed. However, its optimization process lacks explicit geometric constraints, leading to suboptimal geometric reconstruction in regions with sparse or no observational input views. In this work, we try to mitigate the issue by incorporating a pre-trained matching prior to the 3DGS optimization process. We introduce Flow Distillation Sampling (FDS), a technique that leverages pre-trained geometric knowledge to bolster the accuracy of the Gaussian radiance field. Our method employs a strategic sampling technique to target unobserved views adjacent to the input views, utilizing the optical flow calculated from the matching model (Prior Flow) to guide the flow analytically calculated from the 3DGS geometry (Radiance Flow). Comprehensive experiments in depth rendering, mesh reconstruction, and novel view synthesis showcase the significant advantages of FDS over state-of-the-art methods. Additionally, our interpretive experiments and analysis aim to shed light on the effects of FDS on geometric accuracy and rendering quality, potentially providing readers with insights into its performance. Project page: https://nju-3dv.github.io/projects/fds <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07615v1-abstract-full').style.display = 'none'; document.getElementById('2502.07615v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06431">arXiv:2502.06431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06431">pdf</a>, <a href="https://arxiv.org/format/2502.06431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FCVSR: A Frequency-aware Method for Compressed Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Q">Qiang Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Feiyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuyuan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Bull%2C+D">David Bull</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+B">Bing Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06431v1-abstract-short" style="display: inline;"> Compressed video super-resolution (SR) aims to generate high-resolution (HR) videos from the corresponding low-resolution (LR) compressed videos. Recently, some compressed video SR methods attempt to exploit the spatio-temporal information in the frequency domain, showing great promise in super-resolution performance. However, these methods do not differentiate various frequency subbands spatially&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06431v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06431v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06431v1-abstract-full" style="display: none;"> Compressed video super-resolution (SR) aims to generate high-resolution (HR) videos from the corresponding low-resolution (LR) compressed videos. Recently, some compressed video SR methods attempt to exploit the spatio-temporal information in the frequency domain, showing great promise in super-resolution performance. However, these methods do not differentiate various frequency subbands spatially or capture the temporal frequency dynamics, potentially leading to suboptimal results. In this paper, we propose a deep frequency-based compressed video SR model (FCVSR) consisting of a motion-guided adaptive alignment (MGAA) network and a multi-frequency feature refinement (MFFR) module. Additionally, a frequency-aware contrastive loss is proposed for training FCVSR, in order to reconstruct finer spatial details. The proposed model has been evaluated on three public compressed video super-resolution datasets, with results demonstrating its effectiveness when compared to existing works in terms of super-resolution performance (up to a 0.14dB gain in PSNR over the second-best model) and complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06431v1-abstract-full').style.display = 'none'; document.getElementById('2502.06431v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05468">arXiv:2502.05468</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05468">pdf</a>, <a href="https://arxiv.org/format/2502.05468">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Gen-DFL: Decision-Focused Generative Learning for Robust Decision Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P+Z">Prince Zizhuang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jinhao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shuyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fioretto%2C+F">Ferdinando Fioretto</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shixiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05468v1-abstract-short" style="display: inline;"> Decision-focused learning (DFL) integrates predictive models with downstream optimization, directly training machine learning models to minimize decision errors. While DFL has been shown to provide substantial advantages when compared to a counterpart that treats the predictive and prescriptive models separately, it has also been shown to struggle in high-dimensional and risk-sensitive settings, l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05468v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05468v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05468v1-abstract-full" style="display: none;"> Decision-focused learning (DFL) integrates predictive models with downstream optimization, directly training machine learning models to minimize decision errors. While DFL has been shown to provide substantial advantages when compared to a counterpart that treats the predictive and prescriptive models separately, it has also been shown to struggle in high-dimensional and risk-sensitive settings, limiting its applicability in real-world settings. To address this limitation, this paper introduces decision-focused generative learning (Gen-DFL), a novel framework that leverages generative models to adaptively model uncertainty and improve decision quality. Instead of relying on fixed uncertainty sets, Gen-DFL learns a structured representation of the optimization parameters and samples from the tail regions of the learned distribution to enhance robustness against worst-case scenarios. This approach mitigates over-conservatism while capturing complex dependencies in the parameter space. The paper shows, theoretically, that Gen-DFL achieves improved worst-case performance bounds compared to traditional DFL. Empirically, it evaluates Gen-DFL on various scheduling and logistics problems, demonstrating its strong performance against existing DFL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05468v1-abstract-full').style.display = 'none'; document.getElementById('2502.05468v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01906">arXiv:2502.01906</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01906">pdf</a>, <a href="https://arxiv.org/format/2502.01906">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Homogeneity of Vision and Text Tokens in Large Vision-and-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+C">Chia-Wen Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sijie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Fan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xiaohui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Longyin Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01906v1-abstract-short" style="display: inline;"> Large vision-and-language models (LVLMs) typically treat visual and textual embeddings as homogeneous inputs to a large language model (LLM). However, these inputs are inherently different: visual inputs are multi-dimensional and contextually rich, often pre-encoded by models like CLIP, while textual inputs lack this structure. In this paper, we propose Decomposed Attention (D-Attn), a novel metho&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01906v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01906v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01906v1-abstract-full" style="display: none;"> Large vision-and-language models (LVLMs) typically treat visual and textual embeddings as homogeneous inputs to a large language model (LLM). However, these inputs are inherently different: visual inputs are multi-dimensional and contextually rich, often pre-encoded by models like CLIP, while textual inputs lack this structure. In this paper, we propose Decomposed Attention (D-Attn), a novel method that processes visual and textual embeddings differently by decomposing the 1-D causal self-attention in LVLMs. After the attention decomposition, D-Attn diagonalizes visual-to-visual self-attention, reducing computation from $\mathcal{O}(|V|^2)$ to $\mathcal{O}(|V|)$ for $|V|$ visual embeddings without compromising performance. Moreover, D-Attn debiases positional encodings in textual-to-visual cross-attention, further enhancing visual understanding. Finally, we introduce an $伪$-weighting strategy to merge visual and textual information, maximally preserving the pre-trained LLM&#39;s capabilities with minimal modifications. Extensive experiments and rigorous analyses validate the effectiveness of D-Attn, demonstrating significant improvements on multiple image benchmarks while significantly reducing computational costs. Code, data, and models will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01906v1-abstract-full').style.display = 'none'; document.getElementById('2502.01906v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01536">arXiv:2502.01536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01536">pdf</a>, <a href="https://arxiv.org/format/2502.01536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VR-Robo: A Real-to-Sim-to-Real Framework for Visual Robot Navigation and Locomotion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shaoting Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Mou%2C+L">Linzhan Mou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Derun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+B">Baijun Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Runhan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01536v1-abstract-short" style="display: inline;"> Recent success in legged robot locomotion is attributed to the integration of reinforcement learning and physical simulators. However, these policies often encounter challenges when deployed in real-world environments due to sim-to-real gaps, as simulators typically fail to replicate visual realism and complex real-world geometry. Moreover, the lack of realistic visual rendering limits the ability&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01536v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01536v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01536v1-abstract-full" style="display: none;"> Recent success in legged robot locomotion is attributed to the integration of reinforcement learning and physical simulators. However, these policies often encounter challenges when deployed in real-world environments due to sim-to-real gaps, as simulators typically fail to replicate visual realism and complex real-world geometry. Moreover, the lack of realistic visual rendering limits the ability of these policies to support high-level tasks requiring RGB-based perception like ego-centric navigation. This paper presents a Real-to-Sim-to-Real framework that generates photorealistic and physically interactive &#34;digital twin&#34; simulation environments for visual navigation and locomotion learning. Our approach leverages 3D Gaussian Splatting (3DGS) based scene reconstruction from multi-view images and integrates these environments into simulations that support ego-centric visual perception and mesh-based physical interactions. To demonstrate its effectiveness, we train a reinforcement learning policy within the simulator to perform a visual goal-tracking task. Extensive experiments show that our framework achieves RGB-only sim-to-real policy transfer. Additionally, our framework facilitates the rapid adaptation of robot policies with effective exploration capability in complex new environments, highlighting its potential for applications in households and factories. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01536v1-abstract-full').style.display = 'none'; document.getElementById('2502.01536v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://vr-robo.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18110">arXiv:2501.18110</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18110">pdf</a>, <a href="https://arxiv.org/format/2501.18110">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LRA.2024.3417113">10.1109/LRA.2024.3417113 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Lifelong 3D Mapping Framework for Hand-held &amp; Robot-mounted LiDAR Mapping Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Liudi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Prakhya%2C+S+M">Sai Manoj Prakhya</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Senhua Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziyuan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18110v1-abstract-short" style="display: inline;"> We propose a lifelong 3D mapping framework that is modular, cloud-native by design and more importantly, works for both hand-held and robot-mounted 3D LiDAR mapping systems. Our proposed framework comprises of dynamic point removal, multi-session map alignment, map change detection and map version control. First, our sensor-setup agnostic dynamic point removal algorithm works seamlessly with both&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18110v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18110v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18110v1-abstract-full" style="display: none;"> We propose a lifelong 3D mapping framework that is modular, cloud-native by design and more importantly, works for both hand-held and robot-mounted 3D LiDAR mapping systems. Our proposed framework comprises of dynamic point removal, multi-session map alignment, map change detection and map version control. First, our sensor-setup agnostic dynamic point removal algorithm works seamlessly with both hand-held and robot-mounted setups to produce clean static 3D maps. Second, the multi-session map alignment aligns these clean static maps automatically, without manual parameter fine-tuning, into a single reference frame, using a two stage approach based on feature descriptor matching and fine registration. Third, our novel map change detection identifies positive and negative changes between two aligned maps. Finally, the map version control maintains a single base map that represents the current state of the environment, and stores the detected positive and negative changes, and boundary information. Our unique map version control system can reconstruct any of the previous clean session maps and allows users to query changes between any two random mapping sessions, all without storing any input raw session maps, making it very unique. Extensive experiments are performed using hand-held commercial LiDAR mapping devices and open-source robot-mounted LiDAR SLAM algorithms to evaluate each module and the whole 3D lifelong mapping framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18110v1-abstract-full').style.display = 'none'; document.getElementById('2501.18110v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Robotics and Automation Letters, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15598">arXiv:2501.15598</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15598">pdf</a>, <a href="https://arxiv.org/format/2501.15598">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Generative Modeling for Spatially Resolved Gene Expression Inference from Histology Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sichen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuchen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+M">Molei Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+P">Peng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15598v1-abstract-short" style="display: inline;"> Spatial Transcriptomics (ST) allows a high-resolution measurement of RNA sequence abundance by systematically connecting cell morphology depicted in Hematoxylin and Eosin (H&amp;E) stained histology images to spatially resolved gene expressions. ST is a time-consuming, expensive yet powerful experimental technique that provides new opportunities to understand cancer mechanisms at a fine-grained molecu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15598v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15598v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15598v1-abstract-full" style="display: none;"> Spatial Transcriptomics (ST) allows a high-resolution measurement of RNA sequence abundance by systematically connecting cell morphology depicted in Hematoxylin and Eosin (H&amp;E) stained histology images to spatially resolved gene expressions. ST is a time-consuming, expensive yet powerful experimental technique that provides new opportunities to understand cancer mechanisms at a fine-grained molecular level, which is critical for uncovering new approaches for disease diagnosis and treatments. Here, we present $\textbf{Stem}$ ($\textbf{S}$pa$\textbf{T}$ially resolved gene $\textbf{E}$xpression inference with diffusion $\textbf{M}$odel), a novel computational tool that leverages a conditional diffusion generative model to enable in silico gene expression inference from H&amp;E stained images. Through better capturing the inherent stochasticity and heterogeneity in ST data, $\textbf{Stem}$ achieves state-of-the-art performance on spatial gene expression prediction and generates biologically meaningful gene profiles for new H&amp;E stained images at test time. We evaluate the proposed algorithm on datasets with various tissue sources and sequencing platforms, where it demonstrates clear improvement over existing approaches. $\textbf{Stem}$ generates high-fidelity gene expression predictions that share similar gene variation levels as ground truth data, suggesting that our method preserves the underlying biological heterogeneity. Our proposed pipeline opens up the possibility of analyzing existing, easily accessible H&amp;E stained histology images from a genomics point of view without physically performing gene expression profiling and empowers potential biological discovery from H&amp;E stained histology images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15598v1-abstract-full').style.display = 'none'; document.getElementById('2501.15598v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14008">arXiv:2501.14008</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.14008">pdf</a>, <a href="https://arxiv.org/format/2501.14008">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> WAFBOOSTER: Automatic Boosting of WAF Security Against Mutated Malicious Payloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Cong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Simeng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+W">Wenqi Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+R">Ruiying Du</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+Y">Yang Xiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14008v1-abstract-short" style="display: inline;"> Web application firewall (WAF) examines malicious traffic to and from a web application via a set of security rules. It plays a significant role in securing Web applications against web attacks. However, as web attacks grow in sophistication, it is becoming increasingly difficult for WAFs to block the mutated malicious payloads designed to bypass their defenses. In response to this critical securi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14008v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14008v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14008v1-abstract-full" style="display: none;"> Web application firewall (WAF) examines malicious traffic to and from a web application via a set of security rules. It plays a significant role in securing Web applications against web attacks. However, as web attacks grow in sophistication, it is becoming increasingly difficult for WAFs to block the mutated malicious payloads designed to bypass their defenses. In response to this critical security issue, we have developed a novel learning-based framework called WAFBOOSTER, designed to unveil potential bypasses in WAF detections and suggest rules to fortify their security. Using a combination of shadow models and payload generation techniques, we can identify malicious payloads and remove or modify them as needed. WAFBOOSTER generates signatures for these malicious payloads using advanced clustering and regular expression matching techniques to repair any security gaps we uncover. In our comprehensive evaluation of eight real-world WAFs, WAFBOOSTER improved the true rejection rate of mutated malicious payloads from 21% to 96%, with no false rejections. WAFBOOSTER achieves a false acceptance rate 3X lower than state-of-the-art methods for generating malicious payloads. With WAFBOOSTER, we have taken a step forward in securing web applications against the ever-evolving threats. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14008v1-abstract-full').style.display = 'none'; document.getElementById('2501.14008v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12667">arXiv:2501.12667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12667">pdf</a>, <a href="https://arxiv.org/format/2501.12667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sequential Change Point Detection via Denoising Score Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wenbin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+L">Liyan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhigang Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shixiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12667v1-abstract-short" style="display: inline;"> Sequential change-point detection plays a critical role in numerous real-world applications, where timely identification of distributional shifts can greatly mitigate adverse outcomes. Classical methods commonly rely on parametric density assumptions of pre- and post-change distributions, limiting their effectiveness for high-dimensional, complex data streams. This paper proposes a score-based CUS&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12667v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12667v1-abstract-full" style="display: none;"> Sequential change-point detection plays a critical role in numerous real-world applications, where timely identification of distributional shifts can greatly mitigate adverse outcomes. Classical methods commonly rely on parametric density assumptions of pre- and post-change distributions, limiting their effectiveness for high-dimensional, complex data streams. This paper proposes a score-based CUSUM change-point detection, in which the score functions of the data distribution are estimated by injecting noise and applying denoising score matching. We consider both offline and online versions of score estimation. Through theoretical analysis, we demonstrate that denoising score matching can enhance detection power by effectively controlling the injected noise scale. Finally, we validate the practical efficacy of our method through numerical experiments on two synthetic datasets and a real-world earthquake precursor detection task, demonstrating its effectiveness in challenging scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12667v1-abstract-full').style.display = 'none'; document.getElementById('2501.12667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12375">arXiv:2501.12375</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12375">pdf</a>, <a href="https://arxiv.org/format/2501.12375">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Video Depth Anything: Consistent Depth Estimation for Super-Long Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Sili Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Hengkai Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shengnan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Feihu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zilong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Jiashi Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+B">Bingyi Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12375v2-abstract-short" style="display: inline;"> Depth Anything has achieved remarkable success in monocular depth estimation with strong generalization ability. However, it suffers from temporal inconsistency in videos, hindering its practical applications. Various methods have been proposed to alleviate this issue by leveraging video generation models or introducing priors from optical flow and camera poses. Nonetheless, these methods are only&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12375v2-abstract-full').style.display = 'inline'; document.getElementById('2501.12375v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12375v2-abstract-full" style="display: none;"> Depth Anything has achieved remarkable success in monocular depth estimation with strong generalization ability. However, it suffers from temporal inconsistency in videos, hindering its practical applications. Various methods have been proposed to alleviate this issue by leveraging video generation models or introducing priors from optical flow and camera poses. Nonetheless, these methods are only applicable to short videos (&lt; 10 seconds) and require a trade-off between quality and computational efficiency. We propose Video Depth Anything for high-quality, consistent depth estimation in super-long videos (over several minutes) without sacrificing efficiency. We base our model on Depth Anything V2 and replace its head with an efficient spatial-temporal head. We design a straightforward yet effective temporal consistency loss by constraining the temporal depth gradient, eliminating the need for additional geometric priors. The model is trained on a joint dataset of video depth and unlabeled images, similar to Depth Anything V2. Moreover, a novel key-frame-based strategy is developed for long video inference. Experiments show that our model can be applied to arbitrarily long videos without compromising quality, consistency, or generalization ability. Comprehensive evaluations on multiple video benchmarks demonstrate that our approach sets a new state-of-the-art in zero-shot video depth estimation. We offer models of different scales to support a range of scenarios, with our smallest model capable of real-time performance at 30 FPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12375v2-abstract-full').style.display = 'none'; document.getElementById('2501.12375v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://videodepthanything.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08313">arXiv:2501.08313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.08313">pdf</a>, <a href="https://arxiv.org/format/2501.08313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MiniMax-01: Scaling Foundation Models with Lightning Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=MiniMax"> MiniMax</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Aonian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+B">Bangwei Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+B">Boji Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+C">Cheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chunhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Congchao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Da Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+E">Enwei Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Gengxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Guojun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Haohai Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Houze Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiadai Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+J">Jiaqi Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiayuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jingtao Han</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Junbin Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Junhao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Junjie Yan</a> , et al. (65 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08313v1-abstract-short" style="display: inline;"> We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08313v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08313v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08313v1-abstract-full" style="display: none;"> We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. We publicly release MiniMax-01 at https://github.com/MiniMax-AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08313v1-abstract-full').style.display = 'none'; document.getElementById('2501.08313v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A technical report from MiniMax. The authors are listed in alphabetical order. We open-sourced our MiniMax-01 at https://github.com/MiniMax-AI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03884">arXiv:2501.03884</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03884">pdf</a>, <a href="https://arxiv.org/format/2501.03884">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AlphaPO -- Reward shape matters for LLM alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Aman Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shao Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Q">Qingquan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sirou Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+J">Jiwoo Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Saha%2C+A">Ankan Saha</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+V">Viral Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+N">Noah Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+E">Eunki Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jason Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Pillai%2C+N">Natesh Pillai</a>, <a href="/search/cs?searchtype=author&amp;query=Keerthi%2C+S+S">S. Sathiya Keerthi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03884v1-abstract-short" style="display: inline;"> Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03884v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03884v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03884v1-abstract-full" style="display: none;"> Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned. Examples include Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO). These methods often suffer from likelihood displacement, a phenomenon by which the probabilities of preferred responses are often reduced undesirably. In this paper, we argue that, for DAAs the reward (function) shape matters. We introduce AlphaPO, a new DAA method that leverages an $伪$-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and over-optimization. Compared to SimPO, one of the best performing DAAs, AlphaPO leads to about 7\% to 10\% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B. The analysis and results presented highlight the importance of the reward shape, and how one can systematically change it to affect training dynamics, as well as improve alignment performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03884v1-abstract-full').style.display = 'none'; document.getElementById('2501.03884v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02706">arXiv:2501.02706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02706">pdf</a>, <a href="https://arxiv.org/format/2501.02706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multilevel Semantic-Aware Model for AI-Generated Video Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haoran Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shiding Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junwei He</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haozhao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02706v1-abstract-short" style="display: inline;"> The rapid development of diffusion models has greatly advanced AI-generated videos in terms of length and consistency recently, yet assessing AI-generated videos still remains challenging. Previous approaches have often focused on User-Generated Content(UGC), but few have targeted AI-Generated Video Quality Assessment methods. In this work, we introduce MSA-VQA, a Multilevel Semantic-Aware Model f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02706v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02706v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02706v1-abstract-full" style="display: none;"> The rapid development of diffusion models has greatly advanced AI-generated videos in terms of length and consistency recently, yet assessing AI-generated videos still remains challenging. Previous approaches have often focused on User-Generated Content(UGC), but few have targeted AI-Generated Video Quality Assessment methods. In this work, we introduce MSA-VQA, a Multilevel Semantic-Aware Model for AI-Generated Video Quality Assessment, which leverages CLIP-based semantic supervision and cross-attention mechanisms. Our hierarchical framework analyzes video content at three levels: frame, segment, and video. We propose a Prompt Semantic Supervision Module using text encoder of CLIP to ensure semantic consistency between videos and conditional prompts. Additionally, we propose the Semantic Mutation-aware Module to capture subtle variations between frames. Extensive experiments demonstrate our method achieves state-of-the-art results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02706v1-abstract-full').style.display = 'none'; document.getElementById('2501.02706v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02506">arXiv:2501.02506</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02506">pdf</a>, <a href="https://arxiv.org/format/2501.02506">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ToolHop: A Query-Driven Benchmark for Evaluating Large Language Models in Multi-Hop Tool Use </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junjie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Z">Zhengyin Du</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+X">Xuesong Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Weijian Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yufei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zehui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zaiyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sining Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Siyu Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiecao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02506v2-abstract-short" style="display: inline;"> Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of mul&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02506v2-abstract-full').style.display = 'inline'; document.getElementById('2501.02506v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02506v2-abstract-full" style="display: none;"> Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of multi-hop tool use. ToolHop ensures diverse queries, meaningful interdependencies, locally executable tools, detailed feedback, and verifiable answers through a novel query-driven data construction approach that includes tool creation, document refinement, and code generation. We evaluate 14 LLMs across five model families (i.e., LLaMA3.1, Qwen2.5, Gemini1.5, Claude3.5, and GPT), uncovering significant challenges in handling multi-hop tool-use scenarios. The leading model, GPT-4o, achieves an accuracy of 49.04%, underscoring substantial room for improvement. Further analysis reveals variations in tool-use strategies for various families, offering actionable insights to guide the development of more effective approaches. Code and data can be found in https://huggingface.co/datasets/bytedance-research/ToolHop. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02506v2-abstract-full').style.display = 'none'; document.getElementById('2501.02506v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01368">arXiv:2501.01368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01368">pdf</a>, <a href="https://arxiv.org/format/2501.01368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Test-time Controllable Image Generation by Explicit Spatial Constraint Enforcement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Z. Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">B. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">J. Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">L. Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">S. Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">J. Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01368v1-abstract-short" style="display: inline;"> Recent text-to-image generation favors various forms of spatial conditions, e.g., masks, bounding boxes, and key points. However, the majority of the prior art requires form-specific annotations to fine-tune the original model, leading to poor test-time generalizability. Meanwhile, existing training-free methods work well only with simplified prompts and spatial conditions. In this work, we propos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01368v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01368v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01368v1-abstract-full" style="display: none;"> Recent text-to-image generation favors various forms of spatial conditions, e.g., masks, bounding boxes, and key points. However, the majority of the prior art requires form-specific annotations to fine-tune the original model, leading to poor test-time generalizability. Meanwhile, existing training-free methods work well only with simplified prompts and spatial conditions. In this work, we propose a novel yet generic test-time controllable generation method that aims at natural text prompts and complex conditions. Specifically, we decouple spatial conditions into semantic and geometric conditions and then enforce their consistency during the image-generation process individually. As for the former, we target bridging the gap between the semantic condition and text prompts, as well as the gap between such condition and the attention map from diffusion models. To achieve this, we propose to first complete the prompt w.r.t. semantic condition, and then remove the negative impact of distracting prompt words by measuring their statistics in attention maps as well as distances in word space w.r.t. this condition. To further cope with the complex geometric conditions, we introduce a geometric transform module, in which Region-of-Interests will be identified in attention maps and further used to translate category-wise latents w.r.t. geometric condition. More importantly, we propose a diffusion-based latents-refill method to explicitly remove the impact of latents at the RoI, reducing the artifacts on generated images. Experiments on Coco-stuff dataset showcase 30$\%$ relative boost compared to SOTA training-free methods on layout consistency evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01368v1-abstract-full').style.display = 'none'; document.getElementById('2501.01368v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01187">arXiv:2501.01187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01187">pdf</a>, <a href="https://arxiv.org/format/2501.01187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> NET-SA: An Efficient Secure Aggregation Architecture Based on In-Network Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Q">Qingqing Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuyong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yujun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01187v1-abstract-short" style="display: inline;"> Privacy-preserving machine learning (PPML) enables clients to collaboratively train deep learning models without sharing private datasets, but faces privacy leakage risks due to gradient leakage attacks. Prevailing methods leverage secure aggregation strategies to enhance PPML, where clients leverage masks and secret sharing to further protect gradient data while tolerating participant dropouts. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01187v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01187v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01187v1-abstract-full" style="display: none;"> Privacy-preserving machine learning (PPML) enables clients to collaboratively train deep learning models without sharing private datasets, but faces privacy leakage risks due to gradient leakage attacks. Prevailing methods leverage secure aggregation strategies to enhance PPML, where clients leverage masks and secret sharing to further protect gradient data while tolerating participant dropouts. These methods, however, require frequent inter-client communication to negotiate keys and perform secret sharing, leading to substantial communication overhead. To tackle this issue, we propose NET-SA, an efficient secure aggregation architecture for PPML based on in-network computing. NET-SA employs seed homomorphic pseudorandom generators for local gradient masking and utilizes programmable switches for seed aggregation. Accurate and secure gradient aggregation is then performed on the central server based on masked gradients and aggregated seeds. This design effectively reduces communication overhead due to eliminating the communication-intensive phases of seed agreement and secret sharing, with enhanced dropout tolerance due to overcoming the threshold limit of secret sharing. Extensive experiments on server clusters and Intel Tofino programmable switch demonstrate that NET-SA achieves up to 77x and 12x enhancements in runtime and 2x decrease in total client communication cost compared with state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01187v1-abstract-full').style.display = 'none'; document.getElementById('2501.01187v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20993">arXiv:2412.20993</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20993">pdf</a>, <a href="https://arxiv.org/format/2412.20993">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficiently Serving LLM Reasoning Programs with Certaindex </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yichao Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Junda Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Z">Zheyu Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Z">Zhongdongming Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+A">Aurick Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20993v1-abstract-short" style="display: inline;"> The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20993v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20993v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20993v1-abstract-full" style="display: none;"> The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems fail to adapt to the scaling behaviors of these algorithms or the varying difficulty of queries, leading to inefficient resource use and unmet latency targets. We present Dynasor, a system that optimizes inference-time compute for LLM reasoning queries. Unlike traditional engines, Dynasor tracks and schedules requests within reasoning queries and uses Certaindex, a proxy that measures statistical reasoning progress based on model certainty, to guide compute allocation dynamically. Dynasor co-adapts scheduling with reasoning progress: it allocates more compute to hard queries, reduces compute for simpler ones, and terminates unpromising queries early, balancing accuracy, latency, and cost. On diverse datasets and algorithms, Dynasor reduces compute by up to 50% in batch processing and sustaining 3.3x higher query rates or 4.7x tighter latency SLOs in online serving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20993v1-abstract-full').style.display = 'none'; document.getElementById('2412.20993v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20504">arXiv:2412.20504</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20504">pdf</a>, <a href="https://arxiv.org/format/2412.20504">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ReTaKe: Reducing Temporal and Knowledge Redundancy for Long Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Si%2C+Q">Qingyi Si</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jianlong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shiyu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+L">Li Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+L">Liqiang Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20504v2-abstract-short" style="display: inline;"> Video Large Language Models (VideoLLMs) have achieved remarkable progress in video understanding. However, existing VideoLLMs often inherit the limitations of their backbone LLMs in handling long sequences, leading to challenges for long video understanding. Common solutions either simply uniformly sample videos&#39; frames or compress visual tokens, which focus primarily on low-level temporal visual&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20504v2-abstract-full').style.display = 'inline'; document.getElementById('2412.20504v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20504v2-abstract-full" style="display: none;"> Video Large Language Models (VideoLLMs) have achieved remarkable progress in video understanding. However, existing VideoLLMs often inherit the limitations of their backbone LLMs in handling long sequences, leading to challenges for long video understanding. Common solutions either simply uniformly sample videos&#39; frames or compress visual tokens, which focus primarily on low-level temporal visual redundancy, overlooking high-level knowledge redundancy. This limits the achievable compression rate with minimal loss. To this end. we introduce a training-free method, $\textbf{ReTaKe}$, containing two novel modules DPSelect and PivotKV, to jointly model and reduce both temporal visual redundancy and knowledge redundancy for long video understanding. Specifically, DPSelect identifies keyframes with local maximum peak distance based on their visual features, which are closely aligned with human video perception. PivotKV employs the obtained keyframes as pivots and conducts KV-Cache compression for the non-pivot tokens with low attention scores, which are derived from the learned prior knowledge of LLMs. Experiments on benchmarks VideoMME, MLVU, and LVBench, show that ReTaKe can support 4x longer video sequences with minimal performance loss (&lt;1%) and outperform all similar-size VideoLLMs with 3%-5%, even surpassing or on par with much larger ones. Our code is available at https://github.com/SCZwangxiao/video-ReTaKe <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20504v2-abstract-full').style.display = 'none'; document.getElementById('2412.20504v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Update performance in MLVU-dev and LVBench</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19140">arXiv:2412.19140</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19140">pdf</a>, <a href="https://arxiv.org/format/2412.19140">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> SILC-EFSA: Self-aware In-context Learning Correction for Entity-level Financial Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Senbin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+C">Chenyuan He</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongde Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+P">Pengcheng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hanjie Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yuchen Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yuxiang Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zan%2C+H">Hongying Zan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Min Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19140v1-abstract-short" style="display: inline;"> In recent years, fine-grained sentiment analysis in finance has gained significant attention, but the scarcity of entity-level datasets remains a key challenge. To address this, we have constructed the largest English and Chinese financial entity-level sentiment analysis datasets to date. Building on this foundation, we propose a novel two-stage sentiment analysis approach called Self-aware In-con&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19140v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19140v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19140v1-abstract-full" style="display: none;"> In recent years, fine-grained sentiment analysis in finance has gained significant attention, but the scarcity of entity-level datasets remains a key challenge. To address this, we have constructed the largest English and Chinese financial entity-level sentiment analysis datasets to date. Building on this foundation, we propose a novel two-stage sentiment analysis approach called Self-aware In-context Learning Correction (SILC). The first stage involves fine-tuning a base large language model to generate pseudo-labeled data specific to our task. In the second stage, we train a correction model using a GNN-based example retriever, which is informed by the pseudo-labeled data. This two-stage strategy has allowed us to achieve state-of-the-art performance on the newly constructed datasets, advancing the field of financial sentiment analysis. In a case study, we demonstrate the enhanced practical utility of our data and methods in monitoring the cryptocurrency market. Our datasets and code are available at https://github.com/NLP-Bin/SILC-EFSA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19140v1-abstract-full').style.display = 'none'; document.getElementById('2412.19140v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is to be published in the Proceedings of the 31st International Conference on Computational Linguistics (COLING 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16429">arXiv:2412.16429</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16429">pdf</a>, <a href="https://arxiv.org/format/2412.16429">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LearnLM: Improving Gemini for Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=LearnLM+Team"> LearnLM Team</a>, <a href="/search/cs?searchtype=author&amp;query=Modi%2C+A">Abhinit Modi</a>, <a href="/search/cs?searchtype=author&amp;query=Veerubhotla%2C+A+S">Aditya Srikanth Veerubhotla</a>, <a href="/search/cs?searchtype=author&amp;query=Rysbek%2C+A">Aliya Rysbek</a>, <a href="/search/cs?searchtype=author&amp;query=Huber%2C+A">Andrea Huber</a>, <a href="/search/cs?searchtype=author&amp;query=Wiltshire%2C+B">Brett Wiltshire</a>, <a href="/search/cs?searchtype=author&amp;query=Veprek%2C+B">Brian Veprek</a>, <a href="/search/cs?searchtype=author&amp;query=Gillick%2C+D">Daniel Gillick</a>, <a href="/search/cs?searchtype=author&amp;query=Kasenberg%2C+D">Daniel Kasenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmed%2C+D">Derek Ahmed</a>, <a href="/search/cs?searchtype=author&amp;query=Jurenka%2C+I">Irina Jurenka</a>, <a href="/search/cs?searchtype=author&amp;query=Cohan%2C+J">James Cohan</a>, <a href="/search/cs?searchtype=author&amp;query=She%2C+J">Jennifer She</a>, <a href="/search/cs?searchtype=author&amp;query=Wilkowski%2C+J">Julia Wilkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Alarakyia%2C+K">Kaiz Alarakyia</a>, <a href="/search/cs?searchtype=author&amp;query=McKee%2C+K+R">Kevin R. McKee</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lisa Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Kunesch%2C+M">Markus Kunesch</a>, <a href="/search/cs?searchtype=author&amp;query=Schaekermann%2C+M">Mike Schaekermann</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%AEslar%2C+M">Miruna P卯slar</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+N">Nikhil Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmoudieh%2C+P">Parsa Mahmoudieh</a>, <a href="/search/cs?searchtype=author&amp;query=Jhun%2C+P">Paul Jhun</a>, <a href="/search/cs?searchtype=author&amp;query=Wiltberger%2C+S">Sara Wiltberger</a>, <a href="/search/cs?searchtype=author&amp;query=Mohamed%2C+S">Shakir Mohamed</a> , et al. (21 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16429v2-abstract-short" style="display: inline;"> Today&#39;s generative AI systems are tuned to present information by default rather than engage users in service of learning as a human tutor would. To address the wide range of potential education use cases for these systems, we reframe the challenge of injecting pedagogical behavior as one of \textit{pedagogical instruction following}, where training and evaluation examples include system-level ins&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16429v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16429v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16429v2-abstract-full" style="display: none;"> Today&#39;s generative AI systems are tuned to present information by default rather than engage users in service of learning as a human tutor would. To address the wide range of potential education use cases for these systems, we reframe the challenge of injecting pedagogical behavior as one of \textit{pedagogical instruction following}, where training and evaluation examples include system-level instructions describing the specific pedagogy attributes present or desired in subsequent model turns. This framing avoids committing our models to any particular definition of pedagogy, and instead allows teachers or developers to specify desired model behavior. It also clears a path to improving Gemini models for learning -- by enabling the addition of our pedagogical data to post-training mixtures -- alongside their rapidly expanding set of capabilities. Both represent important changes from our initial tech report. We show how training with pedagogical instruction following produces a LearnLM model (available on Google AI Studio) that is preferred substantially by expert raters across a diverse set of learning scenarios, with average preference strengths of 31\% over GPT-4o, 11\% over Claude 3.5, and 13\% over the Gemini 1.5 Pro model LearnLM was based on. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16429v2-abstract-full').style.display = 'none'; document.getElementById('2412.16429v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15606">arXiv:2412.15606</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15606">pdf</a>, <a href="https://arxiv.org/format/2412.15606">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Agent Tuning: Building a VLM-Driven Agent for Efficient Tool Usage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bofei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Pengxiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaojian Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+T">Tao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yue Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yuwei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yunde Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Song-Chun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15606v2-abstract-short" style="display: inline;"> The advancement of large language models (LLMs) prompts the development of multi-modal agents, which are used as a controller to call external tools, providing a feasible way to solve practical tasks. In this paper, we propose a multi-modal agent tuning method that automatically generates multi-modal tool-usage data and tunes a vision-language model (VLM) as the controller for powerful tool-usage&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15606v2-abstract-full').style.display = 'inline'; document.getElementById('2412.15606v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15606v2-abstract-full" style="display: none;"> The advancement of large language models (LLMs) prompts the development of multi-modal agents, which are used as a controller to call external tools, providing a feasible way to solve practical tasks. In this paper, we propose a multi-modal agent tuning method that automatically generates multi-modal tool-usage data and tunes a vision-language model (VLM) as the controller for powerful tool-usage reasoning. To preserve the data quality, we prompt the GPT-4o mini model to generate queries, files, and trajectories, followed by query-file and trajectory verifiers. Based on the data synthesis pipeline, we collect the MM-Traj dataset that contains 20K tasks with trajectories of tool usage. Then, we develop the T3-Agent via \underline{T}rajectory \underline{T}uning on VLMs for \underline{T}ool usage using MM-Traj. Evaluations on the GTA and GAIA benchmarks show that the T3-Agent consistently achieves improvements on two popular VLMs: MiniCPM-V-8.5B and {Qwen2-VL-7B}, which outperforms untrained VLMs by $20\%$, showing the effectiveness of the proposed data synthesis pipeline, leading to high-quality data for tool-usage capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15606v2-abstract-full').style.display = 'none'; document.getElementById('2412.15606v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025, https://mat-agent.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15551">arXiv:2412.15551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15551">pdf</a>, <a href="https://arxiv.org/ps/2412.15551">ps</a>, <a href="https://arxiv.org/format/2412.15551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> New record-breaking binary linear codes constructed from group codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Cong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiuyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15551v1-abstract-short" style="display: inline;"> In this paper, we employ group rings and automorphism groups of binary linear codes to construct new record-breaking binary linear codes. We consider the semidirect product of abelian groups and cyclic groups and use these groups to construct linear codes. Finally, we obtain some linear codes which have better parameters than the code in \cite{bib5}. All the calculation results and corresponding d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15551v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15551v1-abstract-full" style="display: none;"> In this paper, we employ group rings and automorphism groups of binary linear codes to construct new record-breaking binary linear codes. We consider the semidirect product of abelian groups and cyclic groups and use these groups to construct linear codes. Finally, we obtain some linear codes which have better parameters than the code in \cite{bib5}. All the calculation results and corresponding data are listed in the paper or posted online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15551v1-abstract-full').style.display = 'none'; document.getElementById('2412.15551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">For now, this is the initial version which was completed in April 2024, and the authors will add some new results in the future</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15206">arXiv:2412.15206</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15206">pdf</a>, <a href="https://arxiv.org/format/2412.15206">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> AutoTrust: Benchmarking Trustworthiness in Large Vision Language Models for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xing%2C+S">Shuo Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+H">Hongyuan Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiangbo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenzhe Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Renjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+K">Kexin Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaopeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Heng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tianbao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+H">Huaxiu Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+Z">Zhengzhong Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15206v1-abstract-short" style="display: inline;"> Recent advancements in large vision language models (VLMs) tailored for autonomous driving (AD) have shown strong scene understanding and reasoning capabilities, making them undeniable candidates for end-to-end driving systems. However, limited work exists on studying the trustworthiness of DriveVLMs -- a critical factor that directly impacts public transportation safety. In this paper, we introdu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15206v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15206v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15206v1-abstract-full" style="display: none;"> Recent advancements in large vision language models (VLMs) tailored for autonomous driving (AD) have shown strong scene understanding and reasoning capabilities, making them undeniable candidates for end-to-end driving systems. However, limited work exists on studying the trustworthiness of DriveVLMs -- a critical factor that directly impacts public transportation safety. In this paper, we introduce AutoTrust, a comprehensive trustworthiness benchmark for large vision-language models in autonomous driving (DriveVLMs), considering diverse perspectives -- including trustfulness, safety, robustness, privacy, and fairness. We constructed the largest visual question-answering dataset for investigating trustworthiness issues in driving scenarios, comprising over 10k unique scenes and 18k queries. We evaluated six publicly available VLMs, spanning from generalist to specialist, from open-source to commercial models. Our exhaustive evaluations have unveiled previously undiscovered vulnerabilities of DriveVLMs to trustworthiness threats. Specifically, we found that the general VLMs like LLaVA-v1.6 and GPT-4o-mini surprisingly outperform specialized models fine-tuned for driving in terms of overall trustworthiness. DriveVLMs like DriveLM-Agent are particularly vulnerable to disclosing sensitive information. Additionally, both generalist and specialist VLMs remain susceptible to adversarial attacks and struggle to ensure unbiased decision-making across diverse environments and populations. Our findings call for immediate and decisive action to address the trustworthiness of DriveVLMs -- an issue of critical importance to public safety and the welfare of all citizens relying on autonomous transportation systems. Our benchmark is publicly available at \url{https://github.com/taco-group/AutoTrust}, and the leaderboard is released at \url{https://taco-group.github.io/AutoTrust/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15206v1-abstract-full').style.display = 'none'; document.getElementById('2412.15206v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">55 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14482">arXiv:2412.14482</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14482">pdf</a>, <a href="https://arxiv.org/format/2412.14482">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Embedding high-resolution touch across robotic hands enables adaptive human-like grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zihang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wanlin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tengyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Boren Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+K">Kai Du</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hangxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qining Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Althoefer%2C+K">Kaspar Althoefer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Song-Chun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14482v1-abstract-short" style="display: inline;"> Developing robotic hands that adapt to real-world dynamics remains a fundamental challenge in robotics and machine intelligence. Despite significant advances in replicating human hand kinematics and control algorithms, robotic systems still struggle to match human capabilities in dynamic environments, primarily due to inadequate tactile feedback. To bridge this gap, we present F-TAC Hand, a biomim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14482v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14482v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14482v1-abstract-full" style="display: none;"> Developing robotic hands that adapt to real-world dynamics remains a fundamental challenge in robotics and machine intelligence. Despite significant advances in replicating human hand kinematics and control algorithms, robotic systems still struggle to match human capabilities in dynamic environments, primarily due to inadequate tactile feedback. To bridge this gap, we present F-TAC Hand, a biomimetic hand featuring high-resolution tactile sensing (0.1mm spatial resolution) across 70% of its surface area. Through optimized hand design, we overcome traditional challenges in integrating high-resolution tactile sensors while preserving the full range of motion. The hand, powered by our generative algorithm that synthesizes human-like hand configurations, demonstrates robust grasping capabilities in dynamic real-world conditions. Extensive evaluation across 600 real-world trials demonstrates that this tactile-embodied system significantly outperforms non-tactile alternatives in complex manipulation tasks (p&lt;0.0001). These results provide empirical evidence for the critical role of rich tactile embodiment in developing advanced robotic intelligence, offering new perspectives on the relationship between physical sensing capabilities and intelligent behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14482v1-abstract-full').style.display = 'none'; document.getElementById('2412.14482v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14449">arXiv:2412.14449</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14449">pdf</a>, <a href="https://arxiv.org/format/2412.14449">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Color Enhancement for V-PCC Compressed Point Cloud via 2D Attribute Map Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">Jingwei Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zeliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuyuan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yeung%2C+S+A">Siu-Kei Au Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14449v1-abstract-short" style="display: inline;"> Video-based point cloud compression (V-PCC) converts the dynamic point cloud data into video sequences using traditional video codecs for efficient encoding. However, this lossy compression scheme introduces artifacts that degrade the color attributes of the data. This paper introduces a framework designed to enhance the color quality in the V-PCC compressed point clouds. We propose the lightweigh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14449v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14449v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14449v1-abstract-full" style="display: none;"> Video-based point cloud compression (V-PCC) converts the dynamic point cloud data into video sequences using traditional video codecs for efficient encoding. However, this lossy compression scheme introduces artifacts that degrade the color attributes of the data. This paper introduces a framework designed to enhance the color quality in the V-PCC compressed point clouds. We propose the lightweight de-compression Unet (LDC-Unet), a 2D neural network, to optimize the projection maps generated during V-PCC encoding. The optimized 2D maps will then be back-projected to the 3D space to enhance the corresponding point cloud attributes. Additionally, we introduce a transfer learning strategy and develop a customized natural image dataset for the initial training. The model was then fine-tuned using the projection maps of the compressed point clouds. The whole strategy effectively addresses the scarcity of point cloud training data. Our experiments, conducted on the public 8i voxelized full bodies long sequences (8iVSLF) dataset, demonstrate the effectiveness of our proposed method in improving the color quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14449v1-abstract-full').style.display = 'none'; document.getElementById('2412.14449v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE VCIP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14148">arXiv:2412.14148</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14148">pdf</a>, <a href="https://arxiv.org/format/2412.14148">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MCMat: Multiview-Consistent and Physically Accurate PBR Material Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenhao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lingteng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xiaodong Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhengyi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuxiao He</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaoguang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siyu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Weihao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zilong Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14148v1-abstract-short" style="display: inline;"> Existing 2D methods utilize UNet-based diffusion models to generate multi-view physically-based rendering (PBR) maps but struggle with multi-view inconsistency, while some 3D methods directly generate UV maps, encountering generalization issues due to the limited 3D data. To address these problems, we propose a two-stage approach, including multi-view generation and UV materials refinement. In the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14148v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14148v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14148v1-abstract-full" style="display: none;"> Existing 2D methods utilize UNet-based diffusion models to generate multi-view physically-based rendering (PBR) maps but struggle with multi-view inconsistency, while some 3D methods directly generate UV maps, encountering generalization issues due to the limited 3D data. To address these problems, we propose a two-stage approach, including multi-view generation and UV materials refinement. In the generation stage, we adopt a Diffusion Transformer (DiT) model to generate PBR materials, where both the specially designed multi-branch DiT and reference-based DiT blocks adopt a global attention mechanism to promote feature interaction and fusion between different views, thereby improving multi-view consistency. In addition, we adopt a PBR-based diffusion loss to ensure that the generated materials align with realistic physical principles. In the refinement stage, we propose a material-refined DiT that performs inpainting in empty areas and enhances details in UV space. Except for the normal condition, this refinement also takes the material map from the generation stage as an additional condition to reduce the learning difficulty and improve generalization. Extensive experiments show that our method achieves state-of-the-art performance in texturing 3D objects with PBR materials and provides significant advantages for graphics relighting applications. Project Page: https://lingtengqiu.github.io/2024/MCMat/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14148v1-abstract-full').style.display = 'none'; document.getElementById('2412.14148v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://lingtengqiu.github.io/2024/MCMat/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13508">arXiv:2412.13508</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13508">pdf</a>, <a href="https://arxiv.org/format/2412.13508">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Plug-and-Play Tri-Branch Invertible Block for Image Rescaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bao%2C+J">Jingwei Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+J">Jinhua Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+P">Pengcheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Ming Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuyuan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13508v1-abstract-short" style="display: inline;"> High-resolution (HR) images are commonly downscaled to low-resolution (LR) to reduce bandwidth, followed by upscaling to restore their original details. Recent advancements in image rescaling algorithms have employed invertible neural networks (INNs) to create a unified framework for downscaling and upscaling, ensuring a one-to-one mapping between LR and HR images. Traditional methods, utilizing d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13508v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13508v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13508v1-abstract-full" style="display: none;"> High-resolution (HR) images are commonly downscaled to low-resolution (LR) to reduce bandwidth, followed by upscaling to restore their original details. Recent advancements in image rescaling algorithms have employed invertible neural networks (INNs) to create a unified framework for downscaling and upscaling, ensuring a one-to-one mapping between LR and HR images. Traditional methods, utilizing dual-branch based vanilla invertible blocks, process high-frequency and low-frequency information separately, often relying on specific distributions to model high-frequency components. However, processing the low-frequency component directly in the RGB domain introduces channel redundancy, limiting the efficiency of image reconstruction. To address these challenges, we propose a plug-and-play tri-branch invertible block (T-InvBlocks) that decomposes the low-frequency branch into luminance (Y) and chrominance (CbCr) components, reducing redundancy and enhancing feature processing. Additionally, we adopt an all-zero mapping strategy for high-frequency components during upscaling, focusing essential rescaling information within the LR image. Our T-InvBlocks can be seamlessly integrated into existing rescaling models, improving performance in both general rescaling tasks and scenarios involving lossy compression. Extensive experiments confirm that our method advances the state of the art in HR image reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13508v1-abstract-full').style.display = 'none'; document.getElementById('2412.13508v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025. Code is available at https://github.com/Jingwei-Bao/T-InvBlocks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13119">arXiv:2412.13119</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13119">pdf</a>, <a href="https://arxiv.org/format/2412.13119">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.61981/ZFSH2303">10.61981/ZFSH2303 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Flight Patterns for Swarms of Drones </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuqin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Ghandeharizadeh%2C+S">Shahram Ghandeharizadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13119v1-abstract-short" style="display: inline;"> We present flight patterns for a collision-free passage of swarms of drones through one or more openings. The narrow openings provide drones with access to an infrastructure component such as charging stations to charge their depleted batteries and hangars for storage. The flight patterns are a staging area (queues) that match the rate at which an infrastructure component and its openings process&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13119v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13119v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13119v1-abstract-full" style="display: none;"> We present flight patterns for a collision-free passage of swarms of drones through one or more openings. The narrow openings provide drones with access to an infrastructure component such as charging stations to charge their depleted batteries and hangars for storage. The flight patterns are a staging area (queues) that match the rate at which an infrastructure component and its openings process drones. They prevent collisions and may implement different policies that control the order in which drones pass through an opening. We illustrate the flight patterns with a 3D display that uses drones configured with light sources to illuminate shapes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13119v1-abstract-full').style.display = 'none'; document.getElementById('2412.13119v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appeared in the First International Conference on Holodecks, December 15, 2023. Shuqin Zhou and Shahram Ghandeharizadeh. Flight Patterns for Swarms of Drones. In the Proceedings of the First International Conference on Holodecks (Holodecks &#39;23), December 15 2023, Los Angeles, California, USA, 29-33. https://doi.org/10.61981/ZFSH2303</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11579">arXiv:2412.11579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11579">pdf</a>, <a href="https://arxiv.org/format/2412.11579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SweepEvGS: Event-Based 3D Gaussian Splatting for Macro and Micro Radiance Field Rendering from a Single Sweep </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jingqian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shuo Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chutian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+B">Boxin Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+E+Y">Edmund Y. Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11579v1-abstract-short" style="display: inline;"> Recent advancements in 3D Gaussian Splatting (3D-GS) have demonstrated the potential of using 3D Gaussian primitives for high-speed, high-fidelity, and cost-efficient novel view synthesis from continuously calibrated input views. However, conventional methods require high-frame-rate dense and high-quality sharp images, which are time-consuming and inefficient to capture, especially in dynamic envi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11579v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11579v1-abstract-full" style="display: none;"> Recent advancements in 3D Gaussian Splatting (3D-GS) have demonstrated the potential of using 3D Gaussian primitives for high-speed, high-fidelity, and cost-efficient novel view synthesis from continuously calibrated input views. However, conventional methods require high-frame-rate dense and high-quality sharp images, which are time-consuming and inefficient to capture, especially in dynamic environments. Event cameras, with their high temporal resolution and ability to capture asynchronous brightness changes, offer a promising alternative for more reliable scene reconstruction without motion blur. In this paper, we propose SweepEvGS, a novel hardware-integrated method that leverages event cameras for robust and accurate novel view synthesis across various imaging settings from a single sweep. SweepEvGS utilizes the initial static frame with dense event streams captured during a single camera sweep to effectively reconstruct detailed scene views. We also introduce different real-world hardware imaging systems for real-world data collection and evaluation for future research. We validate the robustness and efficiency of SweepEvGS through experiments in three different imaging settings: synthetic objects, real-world macro-level, and real-world micro-level view synthesis. Our results demonstrate that SweepEvGS surpasses existing methods in visual rendering quality, rendering speed, and computational efficiency, highlighting its potential for dynamic practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11579v1-abstract-full').style.display = 'none'; document.getElementById('2412.11579v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11325">arXiv:2412.11325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11325">pdf</a>, <a href="https://arxiv.org/format/2412.11325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sonicmesh: Enhancing 3D Human Mesh Reconstruction in Vision-Impaired Environments With Acoustic Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaoxuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wuyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhaolong Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sicheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yansong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+R">Rui Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jiantao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Gummeson%2C+J">Jeremy Gummeson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11325v1-abstract-short" style="display: inline;"> 3D Human Mesh Reconstruction (HMR) from 2D RGB images faces challenges in environments with poor lighting, privacy concerns, or occlusions. These weaknesses of RGB imaging can be complemented by acoustic signals, which are widely available, easy to deploy, and capable of penetrating obstacles. However, no existing methods effectively combine acoustic signals with RGB data for robust 3D HMR. The pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11325v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11325v1-abstract-full" style="display: none;"> 3D Human Mesh Reconstruction (HMR) from 2D RGB images faces challenges in environments with poor lighting, privacy concerns, or occlusions. These weaknesses of RGB imaging can be complemented by acoustic signals, which are widely available, easy to deploy, and capable of penetrating obstacles. However, no existing methods effectively combine acoustic signals with RGB data for robust 3D HMR. The primary challenges include the low-resolution images generated by acoustic signals and the lack of dedicated processing backbones. We introduce SonicMesh, a novel approach combining acoustic signals with RGB images to reconstruct 3D human mesh. To address the challenges of low resolution and the absence of dedicated processing backbones in images generated by acoustic signals, we modify an existing method, HRNet, for effective feature extraction. We also integrate a universal feature embedding technique to enhance the precision of cross-dimensional feature alignment, enabling SonicMesh to achieve high accuracy. Experimental results demonstrate that SonicMesh accurately reconstructs 3D human mesh in challenging environments such as occlusions, non-line-of-sight scenarios, and poor lighting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11325v1-abstract-full').style.display = 'none'; document.getElementById('2412.11325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10799">arXiv:2412.10799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10799">pdf</a>, <a href="https://arxiv.org/format/2412.10799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Improving Community-Participated Patrol for Anti-Poaching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yufei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y+E">Yixuan Even Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Duo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shibing Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+F">Fei Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10799v2-abstract-short" style="display: inline;"> Community engagement plays a critical role in anti-poaching efforts, yet existing mathematical models aimed at enhancing this engagement often overlook direct participation by community members as alternative patrollers. Unlike professional rangers, community members typically lack flexibility and experience, resulting in new challenges in optimizing patrol resource allocation. To address this gap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10799v2-abstract-full').style.display = 'inline'; document.getElementById('2412.10799v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10799v2-abstract-full" style="display: none;"> Community engagement plays a critical role in anti-poaching efforts, yet existing mathematical models aimed at enhancing this engagement often overlook direct participation by community members as alternative patrollers. Unlike professional rangers, community members typically lack flexibility and experience, resulting in new challenges in optimizing patrol resource allocation. To address this gap, we propose a novel game-theoretic model for community-participated patrol, where a conservation agency strategically deploys both professional rangers and community members to safeguard wildlife against a best-responding poacher. In addition to a mixed-integer linear program formulation, we introduce a Two-Dimensional Binary Search algorithm and a novel Hybrid Waterfilling algorithm to efficiently solve the game in polynomial time. Through extensive experiments and a detailed case study focused on a protected tiger habitat in Northeast China, we demonstrate the effectiveness of our algorithms and the practical applicability of our model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10799v2-abstract-full').style.display = 'none'; document.getElementById('2412.10799v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10673">arXiv:2412.10673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10673">pdf</a>, <a href="https://arxiv.org/format/2412.10673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Proposing and solving olympiad geometry with guided tree search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiajun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Siyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yitao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuxi Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Song-Chun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10673v1-abstract-short" style="display: inline;"> Mathematics olympiads are prestigious competitions, with problem proposing and solving highly honored. Building artificial intelligence that proposes and solves olympiads presents an unresolved challenge in automated theorem discovery and proving, especially in geometry for its combination of numerical and spatial elements. We introduce TongGeometry, a Euclidean geometry system supporting tree-sea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10673v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10673v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10673v1-abstract-full" style="display: none;"> Mathematics olympiads are prestigious competitions, with problem proposing and solving highly honored. Building artificial intelligence that proposes and solves olympiads presents an unresolved challenge in automated theorem discovery and proving, especially in geometry for its combination of numerical and spatial elements. We introduce TongGeometry, a Euclidean geometry system supporting tree-search-based guided problem proposing and solving. The efficient geometry system establishes the most extensive repository of geometry theorems to date: within the same computational budget as the existing state-of-the-art, TongGeometry discovers 6.7 billion geometry theorems requiring auxiliary constructions, including 4.1 billion exhibiting geometric symmetry. Among them, 10 theorems were proposed to regional mathematical olympiads with 3 of TongGeometry&#39;s proposals selected in real competitions, earning spots in a national team qualifying exam or a top civil olympiad in China and the US. Guided by fine-tuned large language models, TongGeometry solved all International Mathematical Olympiad geometry in IMO-AG-30, outperforming gold medalists for the first time. It also surpasses the existing state-of-the-art across a broader spectrum of olympiad-level problems. The full capabilities of the system can be utilized on a consumer-grade machine, making the model more accessible and fostering widespread democratization of its use. By analogy, unlike existing systems that merely solve problems like students, TongGeometry acts like a geometry coach, discovering, presenting, and proving theorems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10673v1-abstract-full').style.display = 'none'; document.getElementById('2412.10673v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10321">arXiv:2412.10321</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10321">pdf</a>, <a href="https://arxiv.org/format/2412.10321">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> AdvPrefix: An Objective for Nuanced LLM Jailbreaks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Sicheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Amos%2C+B">Brandon Amos</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chuan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Evtimov%2C+I">Ivan Evtimov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10321v1-abstract-short" style="display: inline;"> Many jailbreak attacks on large language models (LLMs) rely on a common objective: making the model respond with the prefix &#34;Sure, here is (harmful request)&#34;. While straightforward, this objective has two limitations: limited control over model behaviors, often resulting in incomplete or unrealistic responses, and a rigid format that hinders optimization. To address these limitations, we introduce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10321v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10321v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10321v1-abstract-full" style="display: none;"> Many jailbreak attacks on large language models (LLMs) rely on a common objective: making the model respond with the prefix &#34;Sure, here is (harmful request)&#34;. While straightforward, this objective has two limitations: limited control over model behaviors, often resulting in incomplete or unrealistic responses, and a rigid format that hinders optimization. To address these limitations, we introduce AdvPrefix, a new prefix-forcing objective that enables more nuanced control over model behavior while being easy to optimize. Our objective leverages model-dependent prefixes, automatically selected based on two criteria: high prefilling attack success rates and low negative log-likelihood. It can further simplify optimization by using multiple prefixes for a single user request. AdvPrefix can integrate seamlessly into existing jailbreak attacks to improve their performance for free. For example, simply replacing GCG attack&#39;s target prefixes with ours on Llama-3 improves nuanced attack success rates from 14% to 80%, suggesting that current alignment struggles to generalize to unseen prefixes. Our work demonstrates the importance of jailbreak objectives in achieving nuanced jailbreaks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10321v1-abstract-full').style.display = 'none'; document.getElementById('2412.10321v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08378">arXiv:2412.08378</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08378">pdf</a>, <a href="https://arxiv.org/format/2412.08378">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HyViLM: Enhancing Fine-Grained Recognition with a Hybrid Encoder for Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shiding Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+W">Wenhui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yingbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yanan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08378v2-abstract-short" style="display: inline;"> Recently, there has been growing interest in the capability of multimodal large language models (MLLMs) to process high-resolution images. A common approach currently involves dynamically cropping the original high-resolution image into smaller sub-images, which are then fed into a vision encoder that was pre-trained on lower-resolution images. However, this cropping approach often truncates objec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08378v2-abstract-full').style.display = 'inline'; document.getElementById('2412.08378v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08378v2-abstract-full" style="display: none;"> Recently, there has been growing interest in the capability of multimodal large language models (MLLMs) to process high-resolution images. A common approach currently involves dynamically cropping the original high-resolution image into smaller sub-images, which are then fed into a vision encoder that was pre-trained on lower-resolution images. However, this cropping approach often truncates objects and connected areas in the original image, causing semantic breaks. To address this limitation, we introduce HyViLM, designed to process images of any resolution while retaining the overall context during encoding. Specifically, we: (i) Design a new visual encoder called Hybrid Encoder that not only encodes individual sub-images but also interacts with detailed global visual features, significantly improving the model&#39;s ability to encode high-resolution images. (ii) Propose an optimal feature fusion strategy for the dynamic cropping approach, effectively leveraging information from different layers of the vision encoder. Compared with the state-of-the-art MLLMs under the same setting, our HyViLM outperforms existing MLLMs in nine out of ten tasks. Specifically, HyViLM achieves a 9.6% improvement in performance on the TextVQA task and a 6.9% enhancement on the DocVQA task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08378v2-abstract-full').style.display = 'none'; document.getElementById('2412.08378v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07804">arXiv:2412.07804</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07804">pdf</a>, <a href="https://arxiv.org/format/2412.07804">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> XLSTM-HVED: Cross-Modal Brain Tumor Segmentation and MRI Reconstruction Method Using Vision XLSTM and Heteromodal Variational Encoder-Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenghao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yifei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+S">Shuo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weihong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuanhan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ke%2C+Y">Yifan Ke</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+F">Feiwei Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Changmiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07804v2-abstract-short" style="display: inline;"> Neurogliomas are among the most aggressive forms of cancer, presenting considerable challenges in both treatment and monitoring due to their unpredictable biological behavior. Magnetic resonance imaging (MRI) is currently the preferred method for diagnosing and monitoring gliomas. However, the lack of specific imaging techniques often compromises the accuracy of tumor segmentation during the imagi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07804v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07804v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07804v2-abstract-full" style="display: none;"> Neurogliomas are among the most aggressive forms of cancer, presenting considerable challenges in both treatment and monitoring due to their unpredictable biological behavior. Magnetic resonance imaging (MRI) is currently the preferred method for diagnosing and monitoring gliomas. However, the lack of specific imaging techniques often compromises the accuracy of tumor segmentation during the imaging process. To address this issue, we introduce the XLSTM-HVED model. This model integrates a hetero-modal encoder-decoder framework with the Vision XLSTM module to reconstruct missing MRI modalities. By deeply fusing spatial and temporal features, it enhances tumor segmentation performance. The key innovation of our approach is the Self-Attention Variational Encoder (SAVE) module, which improves the integration of modal features. Additionally, it optimizes the interaction of features between segmentation and reconstruction tasks through the Squeeze-Fusion-Excitation Cross Awareness (SFECA) module. Our experiments using the BraTS 2024 dataset demonstrate that our model significantly outperforms existing advanced methods in handling cases where modalities are missing. Our source code is available at https://github.com/Quanato607/XLSTM-HVED. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07804v2-abstract-full').style.display = 'none'; document.getElementById('2412.07804v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ISBI 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07147">arXiv:2412.07147</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07147">pdf</a>, <a href="https://arxiv.org/format/2412.07147">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MIT-10M: A Large Scale Parallel Corpus of Multilingual Image Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shaolin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Lijie Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07147v2-abstract-short" style="display: inline;"> Image Translation (IT) holds immense potential across diverse domains, enabling the translation of textual content within images into various languages. However, existing datasets often suffer from limitations in scale, diversity, and quality, hindering the development and evaluation of IT models. To address this issue, we introduce MIT-10M, a large-scale parallel corpus of multilingual image tran&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07147v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07147v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07147v2-abstract-full" style="display: none;"> Image Translation (IT) holds immense potential across diverse domains, enabling the translation of textual content within images into various languages. However, existing datasets often suffer from limitations in scale, diversity, and quality, hindering the development and evaluation of IT models. To address this issue, we introduce MIT-10M, a large-scale parallel corpus of multilingual image translation with over 10M image-text pairs derived from real-world data, which has undergone extensive data cleaning and multilingual translation validation. It contains 840K images in three sizes, 28 categories, tasks with three levels of difficulty and 14 languages image-text pairs, which is a considerable improvement on existing datasets. We conduct extensive experiments to evaluate and train models on MIT-10M. The experimental results clearly indicate that our dataset has higher adaptability when it comes to evaluating the performance of the models in tackling challenging and complex image translation tasks in the real world. Moreover, the performance of the model fine-tuned with MIT-10M has tripled compared to the baseline model, further confirming its superiority. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07147v2-abstract-full').style.display = 'none'; document.getElementById('2412.07147v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06864">arXiv:2412.06864</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06864">pdf</a>, <a href="https://arxiv.org/format/2412.06864">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Political-LLM: Large Language Models in Political Science </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lincan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Catherine Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+F">Fred Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hongjia Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chenxiao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jianing Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J+A">Junlong Aaron Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+B">Bolin Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+A">Alex Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weixin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Z">Zhongkai Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lichao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+L">Lifang He</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hanjie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+K">Kaize Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Z">Zijian Du</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+F">Fangzhou Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+J">Jiaxin Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jieyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Swayamdipta%2C+S">Swabha Swayamdipta</a>, <a href="/search/cs?searchtype=author&amp;query=Neiswanger%2C+W">Willie Neiswanger</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Hua Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiyang Hu</a> , et al. (22 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06864v1-abstract-short" style="display: inline;"> In recent years, large language models (LLMs) have been widely adopted in political science tasks such as election prediction, sentiment analysis, policy impact assessment, and misinformation detection. Meanwhile, the need to systematically understand how LLMs can further revolutionize the field also becomes urgent. In this work, we--a multidisciplinary team of researchers spanning computer scienc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06864v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06864v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06864v1-abstract-full" style="display: none;"> In recent years, large language models (LLMs) have been widely adopted in political science tasks such as election prediction, sentiment analysis, policy impact assessment, and misinformation detection. Meanwhile, the need to systematically understand how LLMs can further revolutionize the field also becomes urgent. In this work, we--a multidisciplinary team of researchers spanning computer science and political science--present the first principled framework termed Political-LLM to advance the comprehensive understanding of integrating LLMs into computational political science. Specifically, we first introduce a fundamental taxonomy classifying the existing explorations into two perspectives: political science and computational methodologies. In particular, from the political science perspective, we highlight the role of LLMs in automating predictive and generative tasks, simulating behavior dynamics, and improving causal inference through tools like counterfactual generation; from a computational perspective, we introduce advancements in data preparation, fine-tuning, and evaluation methods for LLMs that are tailored to political contexts. We identify key challenges and future directions, emphasizing the development of domain-specific datasets, addressing issues of bias and fairness, incorporating human expertise, and redefining evaluation criteria to align with the unique requirements of computational political science. Political-LLM seeks to serve as a guidebook for researchers to foster an informed, ethical, and impactful use of Artificial Intelligence in political science. Our online resource is available at: http://political-llm.org/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06864v1-abstract-full').style.display = 'none'; document.getElementById('2412.06864v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">54 Pages, 9 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05011">arXiv:2412.05011</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05011">pdf</a>, <a href="https://arxiv.org/ps/2412.05011">ps</a>, <a href="https://arxiv.org/format/2412.05011">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Galois self-orthogonal MDS codes with large dimensions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+R">Ruhao Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shixin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05011v1-abstract-short" style="display: inline;"> Let $q=p^m$ be a prime power, $e$ be an integer with $0\leq e\leq m-1$ and $s=\gcd(e,m)$. In this paper, for a vector $v$ and a $q$-ary linear code $C$, we give some necessary and sufficient conditions for the equivalent code $vC$ of $C$ and the extended code of $vC$ to be $e$-Galois self-orthogonal. From this, we directly obtain some necessary and sufficient conditions for (extended) generalized&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05011v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05011v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05011v1-abstract-full" style="display: none;"> Let $q=p^m$ be a prime power, $e$ be an integer with $0\leq e\leq m-1$ and $s=\gcd(e,m)$. In this paper, for a vector $v$ and a $q$-ary linear code $C$, we give some necessary and sufficient conditions for the equivalent code $vC$ of $C$ and the extended code of $vC$ to be $e$-Galois self-orthogonal. From this, we directly obtain some necessary and sufficient conditions for (extended) generalized Reed-Solomon (GRS and EGRS) codes to be $e$-Galois self-orthogonal. Furthermore, for all possible $e$ satisfying $0\leq e\leq m-1$, we classify them into three cases (1) $\frac{m}{s}$ odd and $p$ even; (2) $\frac{m}{s}$ odd and $p$ odd; (3) $\frac{m}{s}$ even, and construct several new classes of $e$-Galois self-orthogonal maximum distance separable (MDS) codes. It is worth noting that our $e$-Galois self-orthogonal MDS codes can have dimensions greater than $\lfloor \frac{n+p^e-1}{p^e+1}\rfloor$, which are not covered by previously known ones. Moreover, by propagation rules, we obtain some new MDS codes with Galois hulls of arbitrary dimensions. As an application, many quantum codes can be obtained from these MDS codes with Galois hulls. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05011v1-abstract-full').style.display = 'none'; document.getElementById('2412.05011v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 94B05 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04141">arXiv:2412.04141</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.04141">pdf</a>, <a href="https://arxiv.org/format/2412.04141">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reducing Tool Hallucination via Reliability Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hongshen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Su Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+D">Da Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+R">Ruisheng Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+S">Shuai Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04141v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have extended their capabilities beyond language generation to interact with external systems through tool calling, offering powerful potential for real-world applications. However, the phenomenon of tool hallucinations, which occur when models improperly select or misuse tools, presents critical challenges that can lead to flawed task execution and increased operation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04141v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04141v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04141v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have extended their capabilities beyond language generation to interact with external systems through tool calling, offering powerful potential for real-world applications. However, the phenomenon of tool hallucinations, which occur when models improperly select or misuse tools, presents critical challenges that can lead to flawed task execution and increased operational costs. This paper investigates the concept of reliable tool calling and highlights the necessity of addressing tool hallucinations. We systematically categorize tool hallucinations into two main types: tool selection hallucination and tool usage hallucination. To mitigate these issues, we propose a reliability-focused alignment framework that enhances the model&#39;s ability to accurately assess tool relevance and usage. By proposing a suite of evaluation metrics and evaluating on StableToolBench, we further demonstrate the effectiveness of our framework in mitigating tool hallucination and improving the overall system reliability of LLM tool calling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04141v1-abstract-full').style.display = 'none'; document.getElementById('2412.04141v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03593">arXiv:2412.03593</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.03593">pdf</a>, <a href="https://arxiv.org/format/2412.03593">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CovidLLM: A Robust Large Language Model with Missing Value Adaptation and Multi-Objective Learning Strategy for Predicting Disease Severity and Clinical Outcomes in COVID-19 Patients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shengjun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Siyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Q">Qing Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+H">Hongyan Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hewei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+S">Shujuan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Feng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Rongshang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xionglin Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+S">Shengce Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jiaxin Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03593v1-abstract-short" style="display: inline;"> Coronavirus Disease 2019 (COVID-19), which emerged in 2019, has caused millions of deaths worldwide. Although effective vaccines have been developed to mitigate severe symptoms, certain populations, particularly the elderly and those with comorbidities, remain at high risk for severe outcomes and increased mortality. Consequently, early identification of the severity and clinical outcomes of the d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03593v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03593v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03593v1-abstract-full" style="display: none;"> Coronavirus Disease 2019 (COVID-19), which emerged in 2019, has caused millions of deaths worldwide. Although effective vaccines have been developed to mitigate severe symptoms, certain populations, particularly the elderly and those with comorbidities, remain at high risk for severe outcomes and increased mortality. Consequently, early identification of the severity and clinical outcomes of the disease in these patients is vital to prevent adverse prognoses. Although traditional machine learning and deep learning models have been widely employed in this area, the potential of large language models (LLMs) remains largely unexplored. Our research focuses primarily on constructing specialized prompts and adopting multi-objective learning strategies. We started by selecting serological indicators that significantly correlate with clinical outcomes and disease severity to serve as input data for the model. Blood test samples often contain numerous missing values, and traditional models generally rely on imputation to handle these gaps in the data. In contrast, LLMs offer the advantage of robust semantic understanding. By setting prompts, we can explicitly inform the model when a feature&#39;s value is missing, without the need for imputation. For the multi-objective learning strategy, the model is designed to first predict disease severity and then predict clinical outcomes. Given that LLMs utilize both the input text and the generated tokens as input for generating the next token, the predicted severity is used as a basis for generating the clinical outcome. During the fine-tuning of the LLM, the two objectives influence and improve each other. Our experiments were implemented based on the ChatGLM model. The results demonstrate the effectiveness of LLMs in this task, suggesting promising potential for further development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03593v1-abstract-full').style.display = 'none'; document.getElementById('2412.03593v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02684">arXiv:2412.02684</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.02684">pdf</a>, <a href="https://arxiv.org/format/2412.02684">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AniGS: Animatable Gaussian Avatar from a Single Image with Inconsistent Gaussian Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lingteng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenhao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zuo%2C+Q">Qi Zuo</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xiaodong Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuan Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junfei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Weihao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+L">Liefeng Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guanying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zilong Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02684v1-abstract-short" style="display: inline;"> Generating animatable human avatars from a single image is essential for various digital human modeling applications. Existing 3D reconstruction methods often struggle to capture fine details in animatable models, while generative approaches for controllable animation, though avoiding explicit 3D modeling, suffer from viewpoint inconsistencies in extreme poses and computational inefficiencies. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02684v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02684v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02684v1-abstract-full" style="display: none;"> Generating animatable human avatars from a single image is essential for various digital human modeling applications. Existing 3D reconstruction methods often struggle to capture fine details in animatable models, while generative approaches for controllable animation, though avoiding explicit 3D modeling, suffer from viewpoint inconsistencies in extreme poses and computational inefficiencies. In this paper, we address these challenges by leveraging the power of generative models to produce detailed multi-view canonical pose images, which help resolve ambiguities in animatable human reconstruction. We then propose a robust method for 3D reconstruction of inconsistent images, enabling real-time rendering during inference. Specifically, we adapt a transformer-based video generation model to generate multi-view canonical pose images and normal maps, pretraining on a large-scale video dataset to improve generalization. To handle view inconsistencies, we recast the reconstruction problem as a 4D task and introduce an efficient 3D modeling approach using 4D Gaussian Splatting. Experiments demonstrate that our method achieves photorealistic, real-time animation of 3D human avatars from in-the-wild images, showcasing its effectiveness and generalization capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02684v1-abstract-full').style.display = 'none'; document.getElementById('2412.02684v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://lingtengqiu.github.io/2024/AniGS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02252">arXiv:2412.02252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.02252">pdf</a>, <a href="https://arxiv.org/format/2412.02252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Compressing KV Cache for Long-Context LLM Inference with Inter-Layer Attention Similarity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+D">Da Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Situo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Y">Yuxun Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Su Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hongshen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hanqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+S">Shuai Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+L">Lei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02252v1-abstract-short" style="display: inline;"> The increasing context window size in Large Language Models (LLMs), such as the GPT and LLaMA series, has improved their ability to tackle complex, long-text tasks, but at the cost of inference efficiency, particularly regarding memory and computational complexity. Existing methods, including selective token retention and window-based attention, improve efficiency but risk discarding important tok&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02252v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02252v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02252v1-abstract-full" style="display: none;"> The increasing context window size in Large Language Models (LLMs), such as the GPT and LLaMA series, has improved their ability to tackle complex, long-text tasks, but at the cost of inference efficiency, particularly regarding memory and computational complexity. Existing methods, including selective token retention and window-based attention, improve efficiency but risk discarding important tokens needed for future text generation. In this paper, we propose an approach that enhances LLM efficiency without token loss by reducing the memory and computational load of less important tokens, rather than discarding them.We address two challenges: 1) investigating the distribution of important tokens in the context, discovering recent tokens are more important than distant tokens in context, and 2) optimizing resources for distant tokens by sharing attention scores across layers. The experiments show that our method saves $35\%$ KV cache without compromising the performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02252v1-abstract-full').style.display = 'none'; document.getElementById('2412.02252v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01523">arXiv:2412.01523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01523">pdf</a>, <a href="https://arxiv.org/format/2412.01523">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FlexSP: Accelerating Large Language Model Training via Flexible Sequence Parallelism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiju Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenhan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+F">Fangcheng Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xuefeng Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Huixia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiashi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+F">Faming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+B">Bin Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01523v3-abstract-short" style="display: inline;"> Extending the context length (i.e., the maximum supported sequence length) of LLMs is of paramount significance. To facilitate long context training of LLMs, sequence parallelism has emerged as an essential technique, which scatters each input sequence across multiple devices and necessitates communication to process the sequence. In essence, existing sequence parallelism methods assume homogeneou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01523v3-abstract-full').style.display = 'inline'; document.getElementById('2412.01523v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01523v3-abstract-full" style="display: none;"> Extending the context length (i.e., the maximum supported sequence length) of LLMs is of paramount significance. To facilitate long context training of LLMs, sequence parallelism has emerged as an essential technique, which scatters each input sequence across multiple devices and necessitates communication to process the sequence. In essence, existing sequence parallelism methods assume homogeneous sequence lengths (i.e., all input sequences are equal in length) and therefore leverages a single, static scattering strategy for all input sequences. However, in reality, the sequence lengths in LLM training corpora exhibit substantial variability, often following a long-tail distribution, which leads to workload heterogeneity. In this paper, we show that employing a single, static strategy results in inefficiency and resource under-utilization, highlighting the need for adaptive approaches to handle the heterogeneous workloads across sequences. To address this, we propose a heterogeneity-adaptive sequence parallelism method. For each training step, our approach captures the variability in sequence lengths and assigns the optimal combination of scattering strategies based on workload characteristics. We model this problem as a linear programming optimization and design an efficient and effective solver to find the optimal solution. Furthermore, we implement our method in a high-performance system that supports adaptive parallelization in distributed LLM training. Experimental results demonstrate that our system outperforms state-of-the-art training frameworks by up to 1.98x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01523v3-abstract-full').style.display = 'none'; document.getElementById('2412.01523v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00765">arXiv:2412.00765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00765">pdf</a>, <a href="https://arxiv.org/format/2412.00765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SelfPrompt: Autonomously Evaluating LLM Robustness via Domain-Constrained Knowledge Guidelines and Refined Adversarial Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pei%2C+A">Aihua Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zehua Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shunan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+R">Ruoxi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+J">Ju Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00765v1-abstract-short" style="display: inline;"> Traditional methods for evaluating the robustness of large language models (LLMs) often rely on standardized benchmarks, which can escalate costs and limit evaluations across varied domains. This paper introduces a novel framework designed to autonomously evaluate the robustness of LLMs by incorporating refined adversarial prompts and domain-constrained knowledge guidelines in the form of knowledg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00765v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00765v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00765v1-abstract-full" style="display: none;"> Traditional methods for evaluating the robustness of large language models (LLMs) often rely on standardized benchmarks, which can escalate costs and limit evaluations across varied domains. This paper introduces a novel framework designed to autonomously evaluate the robustness of LLMs by incorporating refined adversarial prompts and domain-constrained knowledge guidelines in the form of knowledge graphs. Our method systematically generates descriptive sentences from domain-constrained knowledge graph triplets to formulate adversarial prompts, enhancing the relevance and challenge of the evaluation. These prompts, generated by the LLM itself and tailored to evaluate its own robustness, undergo a rigorous filtering and refinement process, ensuring that only those with high textual fluency and semantic fidelity are used. This self-evaluation mechanism allows the LLM to evaluate its robustness without the need for external benchmarks. We assess the effectiveness of our framework through extensive testing on both proprietary models like ChatGPT and open-source models such as Llama-3.1, Phi-3, and Mistral. Results confirm that our approach not only reduces dependency on conventional data but also provides a targeted and efficient means of evaluating LLM robustness in constrained domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00765v1-abstract-full').style.display = 'none'; document.getElementById('2412.00765v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00733">arXiv:2412.00733</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00733">pdf</a>, <a href="https://arxiv.org/format/2412.00733">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hallo3: Highly Dynamic and Realistic Portrait Image Animation with Diffusion Transformer Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Jiahao Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+Y">Yun Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+H">Hanlin Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K">Kaihui Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuqi Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+S">Shan Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siyu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00733v3-abstract-short" style="display: inline;"> Existing methodologies for animating portrait images face significant challenges, particularly in handling non-frontal perspectives, rendering dynamic objects around the portrait, and generating immersive, realistic backgrounds. In this paper, we introduce the first application of a pretrained transformer-based video generative model that demonstrates strong generalization capabilities and generat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00733v3-abstract-full').style.display = 'inline'; document.getElementById('2412.00733v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00733v3-abstract-full" style="display: none;"> Existing methodologies for animating portrait images face significant challenges, particularly in handling non-frontal perspectives, rendering dynamic objects around the portrait, and generating immersive, realistic backgrounds. In this paper, we introduce the first application of a pretrained transformer-based video generative model that demonstrates strong generalization capabilities and generates highly dynamic, realistic videos for portrait animation, effectively addressing these challenges. The adoption of a new video backbone model makes previous U-Net-based methods for identity maintenance, audio conditioning, and video extrapolation inapplicable. To address this limitation, we design an identity reference network consisting of a causal 3D VAE combined with a stacked series of transformer layers, ensuring consistent facial identity across video sequences. Additionally, we investigate various speech audio conditioning and motion frame mechanisms to enable the generation of continuous video driven by speech audio. Our method is validated through experiments on benchmark and newly proposed wild datasets, demonstrating substantial improvements over prior methods in generating realistic portraits characterized by diverse orientations within dynamic and immersive scenes. Further visualizations and the source code are available at: https://fudan-generative-vision.github.io/hallo3/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00733v3-abstract-full').style.display = 'none'; document.getElementById('2412.00733v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00115">arXiv:2412.00115</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00115">pdf</a>, <a href="https://arxiv.org/format/2412.00115">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OpenHumanVid: A Large-Scale High-Quality Dataset for Enhancing Human-Centric Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingwang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+Y">Yun Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+S">Shan Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaye Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K">Kaihui Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siyu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00115v3-abstract-short" style="display: inline;"> Recent advancements in visual generation technologies have markedly increased the scale and availability of video datasets, which are crucial for training effective video generation models. However, a significant lack of high-quality, human-centric video datasets presents a challenge to progress in this field. To bridge this gap, we introduce OpenHumanVid, a large-scale and high-quality human-cent&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00115v3-abstract-full').style.display = 'inline'; document.getElementById('2412.00115v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00115v3-abstract-full" style="display: none;"> Recent advancements in visual generation technologies have markedly increased the scale and availability of video datasets, which are crucial for training effective video generation models. However, a significant lack of high-quality, human-centric video datasets presents a challenge to progress in this field. To bridge this gap, we introduce OpenHumanVid, a large-scale and high-quality human-centric video dataset characterized by precise and detailed captions that encompass both human appearance and motion states, along with supplementary human motion conditions, including skeleton sequences and speech audio. To validate the efficacy of this dataset and the associated training strategies, we propose an extension of existing classical diffusion transformer architectures and conduct further pretraining of our models on the proposed dataset. Our findings yield two critical insights: First, the incorporation of a large-scale, high-quality dataset substantially enhances evaluation metrics for generated human videos while preserving performance in general video generation tasks. Second, the effective alignment of text with human appearance, human motion, and facial motion is essential for producing high-quality video outputs. Based on these insights and corresponding methodologies, the straightforward extended network trained on the proposed dataset demonstrates an obvious improvement in the generation of human-centric videos. Project page https://fudan-generative-vision.github.io/OpenHumanVid <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00115v3-abstract-full').style.display = 'none'; document.getElementById('2412.00115v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19545">arXiv:2411.19545</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.19545">pdf</a>, <a href="https://arxiv.org/format/2411.19545">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Unified Interaction Control Framework for Safe Robotic Ultrasound Scanning with Human-Intention-Aware Compliance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+X">Xiangjie Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Shaqi Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yongpeng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+M">Mingrui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Senqiang Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+G">Gao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Shiji Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19545v1-abstract-short" style="display: inline;"> The ultrasound scanning robot operates in environments where frequent human-robot interactions occur. Most existing control methods for ultrasound scanning address only one specific interaction situation or implement hard switches between controllers for different situations, which compromises both safety and efficiency. In this paper, we propose a unified interaction control framework for ultraso&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19545v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19545v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19545v1-abstract-full" style="display: none;"> The ultrasound scanning robot operates in environments where frequent human-robot interactions occur. Most existing control methods for ultrasound scanning address only one specific interaction situation or implement hard switches between controllers for different situations, which compromises both safety and efficiency. In this paper, we propose a unified interaction control framework for ultrasound scanning robots capable of handling all common interactions, distinguishing both human-intended and unintended types, and adapting with appropriate compliance. Specifically, the robot suspends or modulates its ongoing main task if the interaction is intended, e.g., when the doctor grasps the robot to lead the end effector actively. Furthermore, it can identify unintended interactions and avoid potential collision in the null space beforehand. Even if that collision has happened, it can become compliant with the collision in the null space and try to reduce its impact on the main task (where the scan is ongoing) kinematically and dynamically. The multiple situations are integrated into a unified controller with a smooth transition to deal with the interactions by exhibiting human-intention-aware compliance. Experimental results validate the framework&#39;s ability to cope with all common interactions including intended intervention and unintended collision in a collaborative carotid artery ultrasound scanning task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19545v1-abstract-full').style.display = 'none'; document.getElementById('2411.19545v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18021">arXiv:2411.18021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18021">pdf</a>, <a href="https://arxiv.org/format/2411.18021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can bidirectional encoder become the ultimate winner for downstream applications of foundation models? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lewen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xuanyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+J">Juao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xinyi Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shengxin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18021v1-abstract-short" style="display: inline;"> Over the past few decades, Artificial Intelligence(AI) has progressed from the initial machine learning stage to the deep learning stage, and now to the stage of foundational models. Foundational models have the characteristics of pre-training, transfer learning, and self-supervised learning, and pre-trained models can be fine-tuned and applied to various downstream tasks. Under the framework of f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18021v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18021v1-abstract-full" style="display: none;"> Over the past few decades, Artificial Intelligence(AI) has progressed from the initial machine learning stage to the deep learning stage, and now to the stage of foundational models. Foundational models have the characteristics of pre-training, transfer learning, and self-supervised learning, and pre-trained models can be fine-tuned and applied to various downstream tasks. Under the framework of foundational models, models such as Bidirectional Encoder Representations from Transformers(BERT) and Generative Pre-trained Transformer(GPT) have greatly advanced the development of natural language processing(NLP), especially the emergence of many models based on BERT. BERT broke through the limitation of only using one-way methods for language modeling in pre-training by using a masked language model. It can capture bidirectional context information to predict the masked words in the sequence, this can improve the feature extraction ability of the model. This makes the model very useful for downstream tasks, especially for specialized applications. The model using the bidirectional encoder can better understand the domain knowledge and be better applied to these downstream tasks. So we hope to help understand how this technology has evolved and improved model performance in various natural language processing tasks under the background of foundational models and reveal its importance in capturing context information and improving the model&#39;s performance on downstream tasks. This article analyzes one-way and bidirectional models based on GPT and BERT and compares their differences based on the purpose of the model. It also briefly analyzes BERT and the improvements of some models based on BERT. The model&#39;s performance on the Stanford Question Answering Dataset(SQuAD) and General Language Understanding Evaluation(GLUE) was compared. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18021v1-abstract-full').style.display = 'none'; document.getElementById('2411.18021v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures, FLLM2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15351">arXiv:2411.15351</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15351">pdf</a>, <a href="https://arxiv.org/format/2411.15351">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Accelerating CALPHAD-based Phase Diagram Predictions in Complex Alloys Using Universal Machine Learning Potentials: Opportunities and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siya Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Arr%C3%B3yave%2C+R">Raymundo Arr贸yave</a>, <a href="/search/cs?searchtype=author&amp;query=Sar%C4%B1t%C3%BCrk%2C+D">Do臒uhan Sar谋t眉rk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15351v1-abstract-short" style="display: inline;"> Accurate phase diagram prediction is crucial for understanding alloy thermodynamics and advancing materials design. While traditional CALPHAD methods are robust, they are resource-intensive and limited by experimentally assessed data. This work explores the use of machine learning interatomic potentials (MLIPs) such as M3GNet, CHGNet, MACE, SevenNet, and ORB to significantly accelerate phase diagr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15351v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15351v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15351v1-abstract-full" style="display: none;"> Accurate phase diagram prediction is crucial for understanding alloy thermodynamics and advancing materials design. While traditional CALPHAD methods are robust, they are resource-intensive and limited by experimentally assessed data. This work explores the use of machine learning interatomic potentials (MLIPs) such as M3GNet, CHGNet, MACE, SevenNet, and ORB to significantly accelerate phase diagram calculations by using the Alloy Theoretic Automated Toolkit (ATAT) to map calculations of the energies and free energies of atomistic systems to CALPHAD-compatible thermodynamic descriptions. Using case studies including Cr-Mo, Cu-Au, and Pt-W, we demonstrate that MLIPs, particularly ORB, achieve computational speedups exceeding three orders of magnitude compared to DFT while maintaining phase stability predictions within acceptable accuracy. Extending this approach to liquid phases and ternary systems like Cr-Mo-V highlights its versatility for high-entropy alloys and complex chemical spaces. This work demonstrates that MLIPs, integrated with tools like ATAT within a CALPHAD framework, provide an efficient and accurate framework for high-throughput thermodynamic modeling, enabling rapid exploration of novel alloy systems. While many challenges remain to be addressed, the accuracy of some of these MLIPs (ORB in particular) are on the verge of paving the way toward high-throughput generation of CALPHAD thermodynamic descriptions of multi-component, multi-phase alloy systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15351v1-abstract-full').style.display = 'none'; document.getElementById('2411.15351v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15221">arXiv:2411.15221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15221">pdf</a>, <a href="https://arxiv.org/format/2411.15221">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> </div> </div> <p class="title is-5 mathjax"> Reflections from the 2024 Large Language Model (LLM) Hackathon for Applications in Materials Science and Chemistry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zimmermann%2C+Y">Yoel Zimmermann</a>, <a href="/search/cs?searchtype=author&amp;query=Bazgir%2C+A">Adib Bazgir</a>, <a href="/search/cs?searchtype=author&amp;query=Afzal%2C+Z">Zartashia Afzal</a>, <a href="/search/cs?searchtype=author&amp;query=Agbere%2C+F">Fariha Agbere</a>, <a href="/search/cs?searchtype=author&amp;query=Ai%2C+Q">Qianxiang Ai</a>, <a href="/search/cs?searchtype=author&amp;query=Alampara%2C+N">Nawaf Alampara</a>, <a href="/search/cs?searchtype=author&amp;query=Al-Feghali%2C+A">Alexander Al-Feghali</a>, <a href="/search/cs?searchtype=author&amp;query=Ansari%2C+M">Mehrad Ansari</a>, <a href="/search/cs?searchtype=author&amp;query=Antypov%2C+D">Dmytro Antypov</a>, <a href="/search/cs?searchtype=author&amp;query=Aswad%2C+A">Amro Aswad</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+J">Jiaru Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Baibakova%2C+V">Viktoriia Baibakova</a>, <a href="/search/cs?searchtype=author&amp;query=Biswajeet%2C+D+D">Devi Dutta Biswajeet</a>, <a href="/search/cs?searchtype=author&amp;query=Bitzek%2C+E">Erik Bitzek</a>, <a href="/search/cs?searchtype=author&amp;query=Bocarsly%2C+J+D">Joshua D. Bocarsly</a>, <a href="/search/cs?searchtype=author&amp;query=Borisova%2C+A">Anna Borisova</a>, <a href="/search/cs?searchtype=author&amp;query=Bran%2C+A+M">Andres M Bran</a>, <a href="/search/cs?searchtype=author&amp;query=Brinson%2C+L+C">L. Catherine Brinson</a>, <a href="/search/cs?searchtype=author&amp;query=Calderon%2C+M+M">Marcel Moran Calderon</a>, <a href="/search/cs?searchtype=author&amp;query=Canalicchio%2C+A">Alessandro Canalicchio</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+V">Victor Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+Y">Yuan Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Circi%2C+D">Defne Circi</a>, <a href="/search/cs?searchtype=author&amp;query=Charmes%2C+B">Benjamin Charmes</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+V">Vikrant Chaudhary</a> , et al. (119 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15221v2-abstract-short" style="display: inline;"> Here, we present the outcomes from the second Large Language Model (LLM) Hackathon for Applications in Materials Science and Chemistry, which engaged participants across global hybrid locations, resulting in 34 team submissions. The submissions spanned seven key application areas and demonstrated the diverse utility of LLMs for applications in (1) molecular and material property prediction; (2) mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15221v2-abstract-full').style.display = 'inline'; document.getElementById('2411.15221v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15221v2-abstract-full" style="display: none;"> Here, we present the outcomes from the second Large Language Model (LLM) Hackathon for Applications in Materials Science and Chemistry, which engaged participants across global hybrid locations, resulting in 34 team submissions. The submissions spanned seven key application areas and demonstrated the diverse utility of LLMs for applications in (1) molecular and material property prediction; (2) molecular and material design; (3) automation and novel interfaces; (4) scientific communication and education; (5) research data management and automation; (6) hypothesis generation and evaluation; and (7) knowledge extraction and reasoning from scientific literature. Each team submission is presented in a summary table with links to the code and as brief papers in the appendix. Beyond team results, we discuss the hackathon event and its hybrid format, which included physical hubs in Toronto, Montreal, San Francisco, Berlin, Lausanne, and Tokyo, alongside a global online hub to enable local and virtual collaboration. Overall, the event highlighted significant improvements in LLM capabilities since the previous year&#39;s hackathon, suggesting continued expansion of LLMs for applications in materials science and chemistry research. These outcomes demonstrate the dual utility of LLMs as both multipurpose models for diverse machine learning tasks and platforms for rapid prototyping custom applications in scientific research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15221v2-abstract-full').style.display = 'none'; document.getElementById('2411.15221v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updating author information, the submission remains largely unchanged. 98 pages total</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhu%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10