CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 265 results for author: <span class="mathjax">Shao, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shao%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shao, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shao%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shao, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08150">arXiv:2502.08150</a> <span> [<a href="https://arxiv.org/pdf/2502.08150">pdf</a>, <a href="https://arxiv.org/format/2502.08150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Force Matching with Relativistic Constraints: A Physics-Inspired Approach to Stable and Efficient Generative Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Mingda Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08150v1-abstract-short" style="display: inline;"> This paper introduces Force Matching (ForM), a novel framework for generative modeling that represents an initial exploration into leveraging special relativistic mechanics to enhance the stability of the sampling process. By incorporating the Lorentz factor, ForM imposes a velocity constraint, ensuring that sample velocities remain bounded within a constant limit. This constraint serves as a fund… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08150v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08150v1-abstract-full" style="display: none;"> This paper introduces Force Matching (ForM), a novel framework for generative modeling that represents an initial exploration into leveraging special relativistic mechanics to enhance the stability of the sampling process. By incorporating the Lorentz factor, ForM imposes a velocity constraint, ensuring that sample velocities remain bounded within a constant limit. This constraint serves as a fundamental mechanism for stabilizing the generative dynamics, leading to a more robust and controlled sampling process. We provide a rigorous theoretical analysis demonstrating that the velocity constraint is preserved throughout the sampling procedure within the ForM framework. To validate the effectiveness of our approach, we conduct extensive empirical evaluations. On the \textit{half-moons} dataset, ForM significantly outperforms baseline methods, achieving the lowest Euclidean distance loss of \textbf{0.714}, in contrast to vanilla first-order flow matching (5.853) and first- and second-order flow matching (5.793). Additionally, we perform an ablation study to further investigate the impact of our velocity constraint, reaffirming the superiority of ForM in stabilizing the generative process. The theoretical guarantees and empirical results underscore the potential of integrating special relativity principles into generative modeling. Our findings suggest that ForM provides a promising pathway toward achieving stable, efficient, and flexible generative processes. This work lays the foundation for future advancements in high-dimensional generative modeling, opening new avenues for the application of physical principles in machine learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08150v1-abstract-full').style.display = 'none'; document.getElementById('2502.08150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00859">arXiv:2502.00859</a> <span> [<a href="https://arxiv.org/pdf/2502.00859">pdf</a>, <a href="https://arxiv.org/format/2502.00859">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedRIR: Rethinking Information Representation in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongqiang Huang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zerui Shao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zexin Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00859v1-abstract-short" style="display: inline;"> Mobile and Web-of-Things (WoT) devices at the network edge generate vast amounts of data for machine learning applications, yet privacy concerns hinder centralized model training. Federated Learning (FL) allows clients (devices) to collaboratively train a shared model coordinated by a central server without transfer private data, but inherent statistical heterogeneity among clients presents challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00859v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00859v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00859v1-abstract-full" style="display: none;"> Mobile and Web-of-Things (WoT) devices at the network edge generate vast amounts of data for machine learning applications, yet privacy concerns hinder centralized model training. Federated Learning (FL) allows clients (devices) to collaboratively train a shared model coordinated by a central server without transfer private data, but inherent statistical heterogeneity among clients presents challenges, often leading to a dilemma between clients' needs for personalized local models and the server's goal of building a generalized global model. Existing FL methods typically prioritize either global generalization or local personalization, resulting in a trade-off between these two objectives and limiting the full potential of diverse client data. To address this challenge, we propose a novel framework that simultaneously enhances global generalization and local personalization by Rethinking Information Representation in the Federated learning process (FedRIR). Specifically, we introduce Masked Client-Specific Learning (MCSL), which isolates and extracts fine-grained client-specific features tailored to each client's unique data characteristics, thereby enhancing personalization. Concurrently, the Information Distillation Module (IDM) refines the global shared features by filtering out redundant client-specific information, resulting in a purer and more robust global representation that enhances generalization. By integrating the refined global features with the isolated client-specific features, we construct enriched representations that effectively capture both global patterns and local nuances, thereby improving the performance of downstream tasks on the client. The code is available at https://github.com/Deep-Imaging-Group/FedRIR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00859v1-abstract-full').style.display = 'none'; document.getElementById('2502.00859v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00693">arXiv:2502.00693</a> <span> [<a href="https://arxiv.org/pdf/2502.00693">pdf</a>, <a href="https://arxiv.org/format/2502.00693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> DPBloomfilter: Securing Bloom Filters with Differential Privacy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ke%2C+Y">Yekun Ke</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00693v1-abstract-short" style="display: inline;"> The Bloom filter is a simple yet space-efficient probabilistic data structure that supports membership queries for dramatically large datasets. It is widely utilized and implemented across various industrial scenarios, often handling massive datasets that include sensitive user information necessitating privacy preservation. To address the challenge of maintaining privacy within the Bloom filter,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00693v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00693v1-abstract-full" style="display: none;"> The Bloom filter is a simple yet space-efficient probabilistic data structure that supports membership queries for dramatically large datasets. It is widely utilized and implemented across various industrial scenarios, often handling massive datasets that include sensitive user information necessitating privacy preservation. To address the challenge of maintaining privacy within the Bloom filter, we have developed the DPBloomfilter. This innovation integrates the classical differential privacy mechanism, specifically the Random Response technique, into the Bloom filter, offering robust privacy guarantees under the same running complexity as the standard Bloom filter. Through rigorous simulation experiments, we have demonstrated that our DPBloomfilter algorithm maintains high utility while ensuring privacy protections. To the best of our knowledge, this is the first work to provide differential privacy guarantees for the Bloom filter for membership query problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00693v1-abstract-full').style.display = 'none'; document.getElementById('2502.00693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00690">arXiv:2502.00690</a> <span> [<a href="https://arxiv.org/pdf/2502.00690">pdf</a>, <a href="https://arxiv.org/format/2502.00690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> Dissecting Submission Limit in Desk-Rejections: A Mathematical Analysis of Fairness in AI Conference Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuefan Cao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiahao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00690v1-abstract-short" style="display: inline;"> As AI research surges in both impact and volume, conferences have imposed submission limits to maintain paper quality and alleviate organizational pressure. In this work, we examine the fairness of desk-rejection systems under submission limits and reveal that existing practices can result in substantial inequities. Specifically, we formally define the paper submission limit problem and identify a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00690v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00690v1-abstract-full" style="display: none;"> As AI research surges in both impact and volume, conferences have imposed submission limits to maintain paper quality and alleviate organizational pressure. In this work, we examine the fairness of desk-rejection systems under submission limits and reveal that existing practices can result in substantial inequities. Specifically, we formally define the paper submission limit problem and identify a critical dilemma: when the number of authors exceeds three, it becomes impossible to reject papers solely based on excessive submissions without negatively impacting innocent authors. Thus, this issue may unfairly affect early-career researchers, as their submissions may be penalized due to co-authors with significantly higher submission counts, while senior researchers with numerous papers face minimal consequences. To address this, we propose an optimization-based fairness-aware desk-rejection mechanism and formally define two fairness metrics: individual fairness and group fairness. We prove that optimizing individual fairness is NP-hard, whereas group fairness can be efficiently optimized via linear programming. Through case studies, we demonstrate that our proposed system ensures greater equity than existing methods, including those used in CVPR 2025, offering a more socially just approach to managing excessive submissions in AI conferences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00690v1-abstract-full').style.display = 'none'; document.getElementById('2502.00690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00688">arXiv:2502.00688</a> <span> [<a href="https://arxiv.org/pdf/2502.00688">pdf</a>, <a href="https://arxiv.org/format/2502.00688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> High-Order Matching for One-Step Shortcut Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chengyue Gong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Mingda Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00688v1-abstract-short" style="display: inline;"> One-step shortcut diffusion models [Frans, Hafner, Levine and Abbeel, ICLR 2025] have shown potential in vision generation, but their reliance on first-order trajectory supervision is fundamentally limited. The Shortcut model's simplistic velocity-only approach fails to capture intrinsic manifold geometry, leading to erratic trajectories, poor geometric alignment, and instability-especially in hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00688v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00688v1-abstract-full" style="display: none;"> One-step shortcut diffusion models [Frans, Hafner, Levine and Abbeel, ICLR 2025] have shown potential in vision generation, but their reliance on first-order trajectory supervision is fundamentally limited. The Shortcut model's simplistic velocity-only approach fails to capture intrinsic manifold geometry, leading to erratic trajectories, poor geometric alignment, and instability-especially in high-curvature regions. These shortcomings stem from its inability to model mid-horizon dependencies or complex distributional features, leaving it ill-equipped for robust generative modeling. In this work, we introduce HOMO (High-Order Matching for One-Step Shortcut Diffusion), a game-changing framework that leverages high-order supervision to revolutionize distribution transportation. By incorporating acceleration, jerk, and beyond, HOMO not only fixes the flaws of the Shortcut model but also achieves unprecedented smoothness, stability, and geometric precision. Theoretically, we prove that HOMO's high-order supervision ensures superior approximation accuracy, outperforming first-order methods. Empirically, HOMO dominates in complex settings, particularly in high-curvature regions where the Shortcut model struggles. Our experiments show that HOMO delivers smoother trajectories and better distributional alignment, setting a new standard for one-step generative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00688v1-abstract-full').style.display = 'none'; document.getElementById('2502.00688v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span> [<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12948v1-abstract-full" style="display: none;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'none'; document.getElementById('2501.12948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09982">arXiv:2501.09982</a> <span> [<a href="https://arxiv.org/pdf/2501.09982">pdf</a>, <a href="https://arxiv.org/format/2501.09982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RichSpace: Enriching Text-to-Video Prompt Space via Text Embedding Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuefan Cao</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chengyue Gong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09982v2-abstract-short" style="display: inline;"> Text-to-video generation models have made impressive progress, but they still struggle with generating videos with complex features. This limitation often arises from the inability of the text encoder to produce accurate embeddings, which hinders the video generation model. In this work, we propose a novel approach to overcome this challenge by selecting the optimal text embedding through interpol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09982v2-abstract-full').style.display = 'inline'; document.getElementById('2501.09982v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09982v2-abstract-full" style="display: none;"> Text-to-video generation models have made impressive progress, but they still struggle with generating videos with complex features. This limitation often arises from the inability of the text encoder to produce accurate embeddings, which hinders the video generation model. In this work, we propose a novel approach to overcome this challenge by selecting the optimal text embedding through interpolation in the embedding space. We demonstrate that this method enables the video generation model to produce the desired videos. Additionally, we introduce a simple algorithm using perpendicular foot embeddings and cosine similarity to identify the optimal interpolation embedding. Our findings highlight the importance of accurate text embeddings and offer a pathway for improving text-to-video generation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09982v2-abstract-full').style.display = 'none'; document.getElementById('2501.09982v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09734">arXiv:2501.09734</a> <span> [<a href="https://arxiv.org/pdf/2501.09734">pdf</a>, <a href="https://arxiv.org/format/2501.09734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Random Subspace Cubic-Regularization Methods, with Applications to Low-Rank Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cartis%2C+C">Coralia Cartis</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhen Shao</a>, <a href="/search/cs?searchtype=author&query=Tansley%2C+E">Edward Tansley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09734v1-abstract-short" style="display: inline;"> We propose and analyze random subspace variants of the second-order Adaptive Regularization using Cubics (ARC) algorithm. These methods iteratively restrict the search space to some random subspace of the parameters, constructing and minimizing a local model only within this subspace. Thus, our variants only require access to (small-dimensional) projections of first- and second-order problem deriv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09734v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09734v1-abstract-full" style="display: none;"> We propose and analyze random subspace variants of the second-order Adaptive Regularization using Cubics (ARC) algorithm. These methods iteratively restrict the search space to some random subspace of the parameters, constructing and minimizing a local model only within this subspace. Thus, our variants only require access to (small-dimensional) projections of first- and second-order problem derivatives and calculate a reduced step inexpensively. Under suitable assumptions, the ensuing methods maintain the optimal first-order, and second-order, global rates of convergence of (full-dimensional) cubic regularization, while showing improved scalability both theoretically and numerically, particularly when applied to low-rank functions. When applied to the latter, our adaptive variant naturally adapts the subspace size to the true rank of the function, without knowing it a priori. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09734v1-abstract-full').style.display = 'none'; document.getElementById('2501.09734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09349">arXiv:2501.09349</a> <span> [<a href="https://arxiv.org/pdf/2501.09349">pdf</a>, <a href="https://arxiv.org/format/2501.09349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> ChartInsighter: An Approach for Mitigating Hallucination in Time-series Chart Summary Generation with A Benchmark Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bomiao Wang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xueli Shu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zekai Shao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09349v1-abstract-short" style="display: inline;"> Effective chart summary can significantly reduce the time and effort decision makers spend interpreting charts, enabling precise and efficient communication of data insights. Previous studies have faced challenges in generating accurate and semantically rich summaries of time-series data charts. In this paper, we identify summary elements and common hallucination types in the generation of time-se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09349v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09349v1-abstract-full" style="display: none;"> Effective chart summary can significantly reduce the time and effort decision makers spend interpreting charts, enabling precise and efficient communication of data insights. Previous studies have faced challenges in generating accurate and semantically rich summaries of time-series data charts. In this paper, we identify summary elements and common hallucination types in the generation of time-series chart summaries, which serve as our guidelines for automatic generation. We introduce ChartInsighter, which automatically generates chart summaries of time-series data, effectively reducing hallucinations in chart summary generation. Specifically, we assign multiple agents to generate the initial chart summary and collaborate iteratively, during which they invoke external data analysis modules to extract insights and compile them into a coherent summary. Additionally, we implement a self-consistency test method to validate and correct our summary. We create a high-quality benchmark of charts and summaries, with hallucination types annotated on a sentence-by-sentence basis, facilitating the evaluation of the effectiveness of reducing hallucinations. Our evaluations using our benchmark show that our method surpasses state-of-the-art models, and that our summary hallucination rate is the lowest, which effectively reduces various hallucinations and improves summary quality. The benchmark is available at https://github.com/wangfen01/ChartInsighter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09349v1-abstract-full').style.display = 'none'; document.getElementById('2501.09349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04928">arXiv:2501.04928</a> <span> [<a href="https://arxiv.org/pdf/2501.04928">pdf</a>, <a href="https://arxiv.org/format/2501.04928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Image2CADSeq: Computer-Aided Design Sequence and Knowledge Inference from Product Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingang Li</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhenghui Sha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04928v1-abstract-short" style="display: inline;"> Computer-aided design (CAD) tools empower designers to design and modify 3D models through a series of CAD operations, commonly referred to as a CAD sequence. In scenarios where digital CAD files are not accessible, reverse engineering (RE) has been used to reconstruct 3D CAD models. Recent advances have seen the rise of data-driven approaches for RE, with a primary focus on converting 3D data, su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04928v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04928v1-abstract-full" style="display: none;"> Computer-aided design (CAD) tools empower designers to design and modify 3D models through a series of CAD operations, commonly referred to as a CAD sequence. In scenarios where digital CAD files are not accessible, reverse engineering (RE) has been used to reconstruct 3D CAD models. Recent advances have seen the rise of data-driven approaches for RE, with a primary focus on converting 3D data, such as point clouds, into 3D models in boundary representation (B-rep) format. However, obtaining 3D data poses significant challenges, and B-rep models do not reveal knowledge about the 3D modeling process of designs. To this end, our research introduces a novel data-driven approach with an Image2CADSeq neural network model. This model aims to reverse engineer CAD models by processing images as input and generating CAD sequences. These sequences can then be translated into B-rep models using a solid modeling kernel. Unlike B-rep models, CAD sequences offer enhanced flexibility to modify individual steps of model creation, providing a deeper understanding of the construction process of CAD models. To quantitatively and rigorously evaluate the predictive performance of the Image2CADSeq model, we have developed a multi-level evaluation framework for model assessment. The model was trained on a specially synthesized dataset, and various network architectures were explored to optimize the performance. The experimental and validation results show great potential for the model in generating CAD sequences from 2D image data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04928v1-abstract-full').style.display = 'none'; document.getElementById('2501.04928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 10 figures, and 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04377">arXiv:2501.04377</a> <span> [<a href="https://arxiv.org/pdf/2501.04377">pdf</a>, <a href="https://arxiv.org/format/2501.04377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On Computational Limits and Provably Efficient Criteria of Visual Autoregressive Models: A Fine-Grained Complexity Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ke%2C+Y">Yekun Ke</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04377v2-abstract-short" style="display: inline;"> Recently, Visual Autoregressive ($\mathsf{VAR}$) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine ``next-scale prediction'' paradigm. Suppose that $n$ represents the height and width of the last VQ code map generated by $\mathsf{VAR}$ models, the state-of-the-art algorithm in [Tian, Jiang, Yuan, Peng and Wang, Ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04377v2-abstract-full').style.display = 'inline'; document.getElementById('2501.04377v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04377v2-abstract-full" style="display: none;"> Recently, Visual Autoregressive ($\mathsf{VAR}$) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine ``next-scale prediction'' paradigm. Suppose that $n$ represents the height and width of the last VQ code map generated by $\mathsf{VAR}$ models, the state-of-the-art algorithm in [Tian, Jiang, Yuan, Peng and Wang, NeurIPS 2024] takes $O(n^{4+o(1)})$ time, which is computationally inefficient. In this work, we analyze the computational limits and efficiency criteria of $\mathsf{VAR}$ Models through a fine-grained complexity lens. Our key contribution is identifying the conditions under which $\mathsf{VAR}$ computations can achieve sub-quadratic time complexity. We have proved that assuming the Strong Exponential Time Hypothesis ($\mathsf{SETH}$) from fine-grained complexity theory, a sub-quartic time algorithm for $\mathsf{VAR}$ models is impossible. To substantiate our theoretical findings, we present efficient constructions leveraging low-rank approximations that align with the derived criteria. This work initiates the study of the computational efficiency of the $\mathsf{VAR}$ model from a theoretical perspective. Our technique will shed light on advancing scalable and efficient image generation in $\mathsf{VAR}$ frameworks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04377v2-abstract-full').style.display = 'none'; document.getElementById('2501.04377v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19437">arXiv:2412.19437</a> <span> [<a href="https://arxiv.org/pdf/2412.19437">pdf</a>, <a href="https://arxiv.org/format/2412.19437">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-V3 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chenggang Zhao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deli Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Dongjie Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erhang Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fangyun Lin</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+F">Fucong Dai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+G">Guangbo Hao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guowei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Han Bao</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19437v2-abstract-short" style="display: inline;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for loa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v2-abstract-full').style.display = 'inline'; document.getElementById('2412.19437v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19437v2-abstract-full" style="display: none;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v2-abstract-full').style.display = 'none'; document.getElementById('2412.19437v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16720">arXiv:2412.16720</a> <span> [<a href="https://arxiv.org/pdf/2412.16720">pdf</a>, <a href="https://arxiv.org/format/2412.16720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenAI o1 System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Jaech%2C+A">Aaron Jaech</a>, <a href="/search/cs?searchtype=author&query=Kalai%2C+A">Adam Kalai</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Richardson%2C+A">Adam Richardson</a>, <a href="/search/cs?searchtype=author&query=El-Kishky%2C+A">Ahmed El-Kishky</a>, <a href="/search/cs?searchtype=author&query=Low%2C+A">Aiden Low</a>, <a href="/search/cs?searchtype=author&query=Helyar%2C+A">Alec Helyar</a>, <a href="/search/cs?searchtype=author&query=Madry%2C+A">Aleksander Madry</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Iftimie%2C+A">Alex Iftimie</a>, <a href="/search/cs?searchtype=author&query=Karpenko%2C+A">Alex Karpenko</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Neitz%2C+A">Alexander Neitz</a>, <a href="/search/cs?searchtype=author&query=Prokofiev%2C+A">Alexander Prokofiev</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+A">Alexander Wei</a>, <a href="/search/cs?searchtype=author&query=Tam%2C+A">Allison Tam</a>, <a href="/search/cs?searchtype=author&query=Bennett%2C+A">Ally Bennett</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Ananya Kumar</a>, <a href="/search/cs?searchtype=author&query=Saraiva%2C+A">Andre Saraiva</a>, <a href="/search/cs?searchtype=author&query=Vallone%2C+A">Andrea Vallone</a>, <a href="/search/cs?searchtype=author&query=Duberstein%2C+A">Andrew Duberstein</a>, <a href="/search/cs?searchtype=author&query=Kondrich%2C+A">Andrew Kondrich</a> , et al. (238 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16720v1-abstract-short" style="display: inline;"> The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. This leads to state-of-the-ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16720v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16720v1-abstract-full" style="display: none;"> The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. This leads to state-of-the-art performance on certain benchmarks for risks such as generating illicit advice, choosing stereotyped responses, and succumbing to known jailbreaks. Training models to incorporate a chain of thought before answering has the potential to unlock substantial benefits, while also increasing potential risks that stem from heightened intelligence. Our results underscore the need for building robust alignment methods, extensively stress-testing their efficacy, and maintaining meticulous risk management protocols. This report outlines the safety work carried out for the OpenAI o1 and OpenAI o1-mini models, including safety evaluations, external red teaming, and Preparedness Framework evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16720v1-abstract-full').style.display = 'none'; document.getElementById('2412.16720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13647">arXiv:2412.13647</a> <span> [<a href="https://arxiv.org/pdf/2412.13647">pdf</a>, <a href="https://arxiv.org/format/2412.13647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> G-VEval: A Versatile Metric for Evaluating Image and Video Captions Using GPT-4o </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+T+C">Tony Cheng Tong</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Sirui He</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiwen Shao</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13647v2-abstract-short" style="display: inline;"> Evaluation metric of visual captioning is important yet not thoroughly explored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss semantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are limited in zero-shot scenarios. Advanced Language Model-based metrics also struggle with aligning to nuanced human preferences. To address these issues, we introduce G-VEval… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13647v2-abstract-full').style.display = 'inline'; document.getElementById('2412.13647v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13647v2-abstract-full" style="display: none;"> Evaluation metric of visual captioning is important yet not thoroughly explored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss semantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are limited in zero-shot scenarios. Advanced Language Model-based metrics also struggle with aligning to nuanced human preferences. To address these issues, we introduce G-VEval, a novel metric inspired by G-Eval and powered by the new GPT-4o. G-VEval uses chain-of-thought reasoning in large multimodal models and supports three modes: reference-free, reference-only, and combined, accommodating both video and image inputs. We also propose MSVD-Eval, a new dataset for video captioning evaluation, to establish a more transparent and consistent framework for both human experts and evaluation metrics. It is designed to address the lack of clear criteria in existing datasets by introducing distinct dimensions of Accuracy, Completeness, Conciseness, and Relevance (ACCR). Extensive results show that G-VEval outperforms existing methods in correlation with human annotations, as measured by Kendall tau-b and Kendall tau-c. This provides a flexible solution for diverse captioning tasks and suggests a straightforward yet effective approach for large language models to understand video content, paving the way for advancements in automated captioning. Codes are available at https://github.com/ztangaj/gveval <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13647v2-abstract-full').style.display = 'none'; document.getElementById('2412.13647v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09972">arXiv:2412.09972</a> <span> [<a href="https://arxiv.org/pdf/2412.09972">pdf</a>, <a href="https://arxiv.org/format/2412.09972">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Large-Scale Traffic Forecasting with Transformers: A Spatial Data Management Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuchen Fang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+B">Bo Hui</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zezhi Shao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+L">Liwei Deng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xu Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xinke Jiang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kai Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09972v2-abstract-short" style="display: inline;"> Road traffic forecasting is crucial in real-world intelligent transportation scenarios like traffic dispatching and path planning in city management and personal traveling. Spatio-temporal graph neural networks (STGNNs) stand out as the mainstream solution in this task. Nevertheless, the quadratic complexity of remarkable dynamic spatial modeling-based STGNNs has become the bottleneck over large-s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09972v2-abstract-full').style.display = 'inline'; document.getElementById('2412.09972v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09972v2-abstract-full" style="display: none;"> Road traffic forecasting is crucial in real-world intelligent transportation scenarios like traffic dispatching and path planning in city management and personal traveling. Spatio-temporal graph neural networks (STGNNs) stand out as the mainstream solution in this task. Nevertheless, the quadratic complexity of remarkable dynamic spatial modeling-based STGNNs has become the bottleneck over large-scale traffic data. From the spatial data management perspective, we present a novel Transformer framework called PatchSTG to efficiently and dynamically model spatial dependencies for large-scale traffic forecasting with interpretability and fidelity. Specifically, we design a novel irregular spatial patching to reduce the number of points involved in the dynamic calculation of Transformer. The irregular spatial patching first utilizes the leaf K-dimensional tree (KDTree) to recursively partition irregularly distributed traffic points into leaf nodes with a small capacity, and then merges leaf nodes belonging to the same subtree into occupancy-equaled and non-overlapped patches through padding and backtracking. Based on the patched data, depth and breadth attention are used interchangeably in the encoder to dynamically learn local and global spatial knowledge from points in a patch and points with the same index of patches. Experimental results on four real world large-scale traffic datasets show that our PatchSTG achieves train speed and memory utilization improvements up to $10\times$ and $4\times$ with the state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09972v2-abstract-full').style.display = 'none'; document.getElementById('2412.09972v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGKDD 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09417">arXiv:2412.09417</a> <span> [<a href="https://arxiv.org/pdf/2412.09417">pdf</a>, <a href="https://arxiv.org/format/2412.09417">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning Within the Classical Robotics Stack: A Case Study in Robot Soccer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Labiosa%2C+A">Adam Labiosa</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhihan Wang</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Siddhant Agarwal</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+W">William Cong</a>, <a href="/search/cs?searchtype=author&query=Hemkumar%2C+G">Geethika Hemkumar</a>, <a href="/search/cs?searchtype=author&query=Harish%2C+A+N">Abhinav Narayan Harish</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+B">Benjamin Hong</a>, <a href="/search/cs?searchtype=author&query=Kelle%2C+J">Josh Kelle</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuhao Li</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zisen Shao</a>, <a href="/search/cs?searchtype=author&query=Stone%2C+P">Peter Stone</a>, <a href="/search/cs?searchtype=author&query=Hanna%2C+J+P">Josiah P. Hanna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09417v1-abstract-short" style="display: inline;"> Robot decision-making in partially observable, real-time, dynamic, and multi-agent environments remains a difficult and unsolved challenge. Model-free reinforcement learning (RL) is a promising approach to learning decision-making in such domains, however, end-to-end RL in complex environments is often intractable. To address this challenge in the RoboCup Standard Platform League (SPL) domain, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09417v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09417v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09417v1-abstract-full" style="display: none;"> Robot decision-making in partially observable, real-time, dynamic, and multi-agent environments remains a difficult and unsolved challenge. Model-free reinforcement learning (RL) is a promising approach to learning decision-making in such domains, however, end-to-end RL in complex environments is often intractable. To address this challenge in the RoboCup Standard Platform League (SPL) domain, we developed a novel architecture integrating RL within a classical robotics stack, while employing a multi-fidelity sim2real approach and decomposing behavior into learned sub-behaviors with heuristic selection. Our architecture led to victory in the 2024 RoboCup SPL Challenge Shield Division. In this work, we fully describe our system's architecture and empirically analyze key design decisions that contributed to its success. Our approach demonstrates how RL-based behaviors can be integrated into complete robot behavior architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09417v1-abstract-full').style.display = 'none'; document.getElementById('2412.09417v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00811">arXiv:2412.00811</a> <span> [<a href="https://arxiv.org/pdf/2412.00811">pdf</a>, <a href="https://arxiv.org/format/2412.00811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Vid-Morp: Video Moment Retrieval Pretraining from Unlabeled Videos in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+P">Peijun Bao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zihao Shao</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+B+P">Boon Poh Ng</a>, <a href="/search/cs?searchtype=author&query=Er%2C+M+H">Meng Hwa Er</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00811v1-abstract-short" style="display: inline;"> Given a natural language query, video moment retrieval aims to localize the described temporal moment in an untrimmed video. A major challenge of this task is its heavy dependence on labor-intensive annotations for training. Unlike existing works that directly train models on manually curated data, we propose a novel paradigm to reduce annotation costs: pretraining the model on unlabeled, real-wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00811v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00811v1-abstract-full" style="display: none;"> Given a natural language query, video moment retrieval aims to localize the described temporal moment in an untrimmed video. A major challenge of this task is its heavy dependence on labor-intensive annotations for training. Unlike existing works that directly train models on manually curated data, we propose a novel paradigm to reduce annotation costs: pretraining the model on unlabeled, real-world videos. To support this, we introduce Video Moment Retrieval Pretraining (Vid-Morp), a large-scale dataset collected with minimal human intervention, consisting of over 50K videos captured in the wild and 200K pseudo annotations. Direct pretraining on these imperfect pseudo annotations, however, presents significant challenges, including mismatched sentence-video pairs and imprecise temporal boundaries. To address these issues, we propose the ReCorrect algorithm, which comprises two main phases: semantics-guided refinement and memory-consensus correction. The semantics-guided refinement enhances the pseudo labels by leveraging semantic similarity with video frames to clean out unpaired data and make initial adjustments to temporal boundaries. In the following memory-consensus correction phase, a memory bank tracks the model predictions, progressively correcting the temporal boundaries based on consensus within the memory. Comprehensive experiments demonstrate ReCorrect's strong generalization abilities across multiple downstream settings. Zero-shot ReCorrect achieves over 75% and 80% of the best fully-supervised performance on two benchmarks, while unsupervised ReCorrect reaches about 85% on both. The code, dataset, and pretrained models are available at https://github.com/baopj/Vid-Morp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00811v1-abstract-full').style.display = 'none'; document.getElementById('2412.00811v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16327">arXiv:2411.16327</a> <span> [<a href="https://arxiv.org/pdf/2411.16327">pdf</a>, <a href="https://arxiv.org/format/2411.16327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CapHDR2IR: Caption-Driven Transfer from Visible Light to Infrared Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jingchao Peng</a>, <a href="/search/cs?searchtype=author&query=Bashford-Rogers%2C+T">Thomas Bashford-Rogers</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhuang Shao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haitao Zhao</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+A+R">Aru Ranjan Singh</a>, <a href="/search/cs?searchtype=author&query=Goswami%2C+A">Abhishek Goswami</a>, <a href="/search/cs?searchtype=author&query=Debattista%2C+K">Kurt Debattista</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16327v1-abstract-short" style="display: inline;"> Infrared (IR) imaging offers advantages in several fields due to its unique ability of capturing content in extreme light conditions. However, the demanding hardware requirements of high-resolution IR sensors limit its widespread application. As an alternative, visible light can be used to synthesize IR images but this causes a loss of fidelity in image details and introduces inconsistencies due t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16327v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16327v1-abstract-full" style="display: none;"> Infrared (IR) imaging offers advantages in several fields due to its unique ability of capturing content in extreme light conditions. However, the demanding hardware requirements of high-resolution IR sensors limit its widespread application. As an alternative, visible light can be used to synthesize IR images but this causes a loss of fidelity in image details and introduces inconsistencies due to lack of contextual awareness of the scene. This stems from a combination of using visible light with a standard dynamic range, especially under extreme lighting, and a lack of contextual awareness can result in pseudo-thermal-crossover artifacts. This occurs when multiple objects with similar temperatures appear indistinguishable in the training data, further exacerbating the loss of fidelity. To solve this challenge, this paper proposes CapHDR2IR, a novel framework incorporating vision-language models using high dynamic range (HDR) images as inputs to generate IR images. HDR images capture a wider range of luminance variations, ensuring reliable IR image generation in different light conditions. Additionally, a dense caption branch integrates semantic understanding, resulting in more meaningful and discernible IR outputs. Extensive experiments on the HDRT dataset show that the proposed CapHDR2IR achieves state-of-the-art performance compared with existing general domain transfer methods and those tailored for visible-to-infrared image translation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16327v1-abstract-full').style.display = 'none'; document.getElementById('2411.16327v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15799">arXiv:2411.15799</a> <span> [<a href="https://arxiv.org/pdf/2411.15799">pdf</a>, <a href="https://arxiv.org/format/2411.15799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s10489-024-05849-5">10.1007/s10489-024-05849-5 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Symmetric Perception and Ordinal Regression for Detecting Scoliosis Natural Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiaojia Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiaoqi Guo</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiwen Shao</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yuhu Dai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Ming Zhang</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+C">Chuandong Lang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15799v1-abstract-short" style="display: inline;"> Scoliosis is one of the most common diseases in adolescents. Traditional screening methods for the scoliosis usually use radiographic examination, which requires certified experts with medical instruments and brings the radiation risk. Considering such requirement and inconvenience, we propose to use natural images of the human back for wide-range scoliosis screening, which is a challenging proble… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15799v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15799v1-abstract-full" style="display: none;"> Scoliosis is one of the most common diseases in adolescents. Traditional screening methods for the scoliosis usually use radiographic examination, which requires certified experts with medical instruments and brings the radiation risk. Considering such requirement and inconvenience, we propose to use natural images of the human back for wide-range scoliosis screening, which is a challenging problem. In this paper, we notice that the human back has a certain degree of symmetry, and asymmetrical human backs are usually caused by spinal lesions. Besides, scoliosis severity levels have ordinal relationships. Taking inspiration from this, we propose a dual-path scoliosis detection network with two main modules: symmetric feature matching module (SFMM) and ordinal regression head (ORH). Specifically, we first adopt a backbone to extract features from both the input image and its horizontally flipped image. Then, we feed the two extracted features into the SFMM to capture symmetric relationships. Finally, we use the ORH to transform the ordinal regression problem into a series of binary classification sub-problems. Extensive experiments demonstrate that our approach outperforms state-of-the-art methods as well as human performance, which provides a promising and economic solution to wide-range scoliosis screening. In particular, our method achieves accuracies of 95.11% and 81.46% in estimation of general severity level and fine-grained severity level of the scoliosis, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15799v1-abstract-full').style.display = 'none'; document.getElementById('2411.15799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by Applied Intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15332">arXiv:2411.15332</a> <span> [<a href="https://arxiv.org/pdf/2411.15332">pdf</a>, <a href="https://arxiv.org/format/2411.15332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Mera: Memory Reduction and Acceleration for Quantum Circuit Simulation via Redundancy Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuhong Song</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+E+H">Edwin Hsing-Mean Sha</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Longshan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhuge%2C+Q">Qingfeng Zhuge</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zili Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15332v1-abstract-short" style="display: inline;"> With the development of quantum computing, quantum processor demonstrates the potential supremacy in specific applications, such as Grovers database search and popular quantum neural networks (QNNs). For better calibrating the quantum algorithms and machines, quantum circuit simulation on classical computers becomes crucial. However, as the number of quantum bits (qubits) increases, the memory req… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15332v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15332v1-abstract-full" style="display: none;"> With the development of quantum computing, quantum processor demonstrates the potential supremacy in specific applications, such as Grovers database search and popular quantum neural networks (QNNs). For better calibrating the quantum algorithms and machines, quantum circuit simulation on classical computers becomes crucial. However, as the number of quantum bits (qubits) increases, the memory requirement grows exponentially. In order to reduce memory usage and accelerate simulation, we propose a multi-level optimization, namely Mera, by exploring memory and computation redundancy. First, for a large number of sparse quantum gates, we propose two compressed structures for low-level full-state simulation. The corresponding gate operations are designed for practical implementations, which are relieved from the longtime compression and decompression. Second, for the dense Hadamard gate, which is definitely used to construct the superposition, we design a customized structure for significant memory saving as a regularity-oriented simulation. Meanwhile, an ondemand amplitude updating process is optimized for execution acceleration. Experiments show that our compressed structures increase the number of qubits from 17 to 35, and achieve up to 6.9 times acceleration for QNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15332v1-abstract-full').style.display = 'none'; document.getElementById('2411.15332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2024 42nd IEEE International Conference on Computer Design (ICCD)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12604">arXiv:2411.12604</a> <span> [<a href="https://arxiv.org/pdf/2411.12604">pdf</a>, <a href="https://arxiv.org/ps/2411.12604">ps</a>, <a href="https://arxiv.org/format/2411.12604">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SG-LRA: Self-Generating Automatic Scoliosis Cobb Angle Measurement with Low-Rank Approximation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiwen Shao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yichen Yuan</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lizhuang Ma</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiaojia Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12604v1-abstract-short" style="display: inline;"> Automatic Cobb angle measurement from X-ray images is crucial for scoliosis screening and diagnosis. However, most existing regression-based methods and segmentation-based methods struggle with inaccurate spine representations or mask connectivity/fragmentation issues. Besides, landmark-based methods suffer from insufficient training data and annotations. To address these challenges, we propose a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12604v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12604v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12604v1-abstract-full" style="display: none;"> Automatic Cobb angle measurement from X-ray images is crucial for scoliosis screening and diagnosis. However, most existing regression-based methods and segmentation-based methods struggle with inaccurate spine representations or mask connectivity/fragmentation issues. Besides, landmark-based methods suffer from insufficient training data and annotations. To address these challenges, we propose a novel framework including Self-Generation pipeline and Low-Rank Approximation representation (SG-LRA) for automatic Cobb angle measurement. Specifically, we propose a parameterized spine contour representation based on LRA, which enables eigen-spine decomposition and spine contour reconstruction. We can directly obtain spine contour with only regressed LRA coefficients, which form a more accurate spine representation than rectangular boxes. Also, we combine LRA coefficient regression with anchor box classification to solve inaccurate predictions and mask connectivity issues. Moreover, we develop a data engine with automatic annotation and automatic selection in an iterative manner, which is trained on a private Spinal2023 dataset. With our data engine, we generate the largest scoliosis X-ray dataset named Spinal-AI2024 largely without privacy leaks. Extensive experiments on public AASCE2019, private Spinal2023, and generated Spinal-AI2024 datasets demonstrate that our method achieves state-of-the-art Cobb angle measurement performance. Our code and Spinal-AI2024 dataset are available at https://github.com/Ernestchenchen/SG-LRA and https://github.com/Ernestchenchen/Spinal-AI2024, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12604v1-abstract-full').style.display = 'none'; document.getElementById('2411.12604v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07722">arXiv:2411.07722</a> <span> [<a href="https://arxiv.org/pdf/2411.07722">pdf</a>, <a href="https://arxiv.org/format/2411.07722">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Is Cognition consistent with Perception? Assessing and Mitigating Multimodal Knowledge Conflicts in Document Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zirui Shao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chuwei Luo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhaoqing Zhu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+H">Hangdi Xing</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhi Yu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qi Zheng</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiajun Bu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07722v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have shown impressive capabilities in document understanding, a rapidly growing research area with significant industrial demand in recent years. As a multimodal task, document understanding requires models to possess both perceptual and cognitive abilities. However, current MLLMs often face conflicts between perception and cognition. Taking a document VQA… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07722v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07722v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07722v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have shown impressive capabilities in document understanding, a rapidly growing research area with significant industrial demand in recent years. As a multimodal task, document understanding requires models to possess both perceptual and cognitive abilities. However, current MLLMs often face conflicts between perception and cognition. Taking a document VQA task (cognition) as an example, an MLLM might generate answers that do not match the corresponding visual content identified by its OCR (perception). This conflict suggests that the MLLM might struggle to establish an intrinsic connection between the information it "sees" and what it "understands." Such conflicts challenge the intuitive notion that cognition is consistent with perception, hindering the performance and explainability of MLLMs. In this paper, we define the conflicts between cognition and perception as Cognition and Perception (C&P) knowledge conflicts, a form of multimodal knowledge conflicts, and systematically assess them with a focus on document understanding. Our analysis reveals that even GPT-4o, a leading MLLM, achieves only 68.6% C&P consistency. To mitigate the C&P knowledge conflicts, we propose a novel method called Multimodal Knowledge Consistency Fine-tuning. This method first ensures task-specific consistency and then connects the cognitive and perceptual knowledge. Our method significantly reduces C&P knowledge conflicts across all tested MLLMs and enhances their performance in both cognitive and perceptual tasks in most scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07722v1-abstract-full').style.display = 'none'; document.getElementById('2411.07722v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04672">arXiv:2411.04672</a> <span> [<a href="https://arxiv.org/pdf/2411.04672">pdf</a>, <a href="https://arxiv.org/format/2411.04672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Semantic-Aware Resource Management for C-V2X Platooning via Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiyu Shao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiong Wu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+P">Pingyi Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kezhi Wang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Q">Qiang Fan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen Chen</a>, <a href="/search/cs?searchtype=author&query=Letaief%2C+K+B">Khaled B. Letaief</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04672v1-abstract-short" style="display: inline;"> This paper presents a semantic-aware multi-modal resource allocation (SAMRA) for multi-task using multi-agent reinforcement learning (MARL), termed SAMRAMARL, utilizing in platoon systems where cellular vehicle-to-everything (C-V2X) communication is employed. The proposed approach leverages the semantic information to optimize the allocation of communication resources. By integrating a distributed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04672v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04672v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04672v1-abstract-full" style="display: none;"> This paper presents a semantic-aware multi-modal resource allocation (SAMRA) for multi-task using multi-agent reinforcement learning (MARL), termed SAMRAMARL, utilizing in platoon systems where cellular vehicle-to-everything (C-V2X) communication is employed. The proposed approach leverages the semantic information to optimize the allocation of communication resources. By integrating a distributed multi-agent reinforcement learning (MARL) algorithm, SAMRAMARL enables autonomous decision-making for each vehicle, channel assignment optimization, power allocation, and semantic symbol length based on the contextual importance of the transmitted information. This semantic-awareness ensures that both vehicle-to-vehicle (V2V) and vehicle-to-infrastructure (V2I) communications prioritize data that is critical for maintaining safe and efficient platoon operations. The framework also introduces a tailored quality of experience (QoE) metric for semantic communication, aiming to maximize QoE in V2V links while improving the success rate of semantic information transmission (SRS). Extensive simulations has demonstrated that SAMRAMARL outperforms existing methods, achieving significant gains in QoE and communication efficiency in C-V2X platooning scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04672v1-abstract-full').style.display = 'none'; document.getElementById('2411.04672v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE Journal. The source code has been released at:https://github.com/qiongwu86/Semantic-Aware-Resource-Management-for-C-V2X-Platooning-via-Multi-Agent-Reinforcement-Learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00499">arXiv:2411.00499</a> <span> [<a href="https://arxiv.org/pdf/2411.00499">pdf</a>, <a href="https://arxiv.org/format/2411.00499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cross-modal semantic segmentation for indoor environmental perception using single-chip millimeter-wave radar raw data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hairuo Hu</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+H">Haiyong Cong</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhuyu Shao</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Y">Yubo Bi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinghao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00499v2-abstract-short" style="display: inline;"> In the context of firefighting and rescue operations, a cross-modal semantic segmentation model based on a single-chip millimeter-wave (mmWave) radar for indoor environmental perception is proposed and discussed. To efficiently obtain high-quality labels, an automatic label generation method utilizing LiDAR point clouds and occupancy grid maps is introduced. The proposed segmentation model is base… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00499v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00499v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00499v2-abstract-full" style="display: none;"> In the context of firefighting and rescue operations, a cross-modal semantic segmentation model based on a single-chip millimeter-wave (mmWave) radar for indoor environmental perception is proposed and discussed. To efficiently obtain high-quality labels, an automatic label generation method utilizing LiDAR point clouds and occupancy grid maps is introduced. The proposed segmentation model is based on U-Net. A spatial attention module is incorporated, which enhanced the performance of the mode. The results demonstrate that cross-modal semantic segmentation provides a more intuitive and accurate representation of indoor environments. Unlike traditional methods, the model's segmentation performance is minimally affected by azimuth. Although performance declines with increasing distance, this can be mitigated by a well-designed model. Additionally, it was found that using raw ADC data as input is ineffective; compared to RA tensors, RD tensors are more suitable for the proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00499v2-abstract-full').style.display = 'none'; document.getElementById('2411.00499v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5291 words, 17 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19933">arXiv:2410.19933</a> <span> [<a href="https://arxiv.org/pdf/2410.19933">pdf</a>, <a href="https://arxiv.org/format/2410.19933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Safety in Reinforcement Learning with Human Feedback via Rectified Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiyue Peng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hengquan Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+D">Dongqing Zou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Ziyu Shao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Honghao Wei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19933v1-abstract-short" style="display: inline;"> Balancing helpfulness and safety (harmlessness) is a critical challenge in aligning large language models (LLMs). Current approaches often decouple these two objectives, training separate preference models for helpfulness and safety, while framing safety as a constraint within a constrained Markov Decision Process (CMDP) framework. However, these methods can lead to ``safety interference'', where… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19933v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19933v1-abstract-full" style="display: none;"> Balancing helpfulness and safety (harmlessness) is a critical challenge in aligning large language models (LLMs). Current approaches often decouple these two objectives, training separate preference models for helpfulness and safety, while framing safety as a constraint within a constrained Markov Decision Process (CMDP) framework. However, these methods can lead to ``safety interference'', where average-based safety constraints compromise the safety of some prompts in favor of others. To address this issue, we propose \textbf{Rectified Policy Optimization (RePO)}, which replaces the average safety constraint with stricter (per prompt) safety constraints. At the core of RePO is a policy update mechanism driven by rectified policy gradients, which penalizes the strict safety violation of every prompt, thereby enhancing safety across nearly all prompts. Our experiments on Alpaca-7B demonstrate that RePO improves the safety alignment and reduces the safety interference compared to baseline methods. Code is available at https://github.com/pxyWaterMoon/RePO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19933v1-abstract-full').style.display = 'none'; document.getElementById('2410.19933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17513">arXiv:2410.17513</a> <span> [<a href="https://arxiv.org/pdf/2410.17513">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> HCDN: A Change Detection Network for Construction Housekeeping Using Feature Fusion and Large Vision Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+K">Kailai Sun</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zherui Shao</a>, <a href="/search/cs?searchtype=author&query=Goh%2C+Y+M">Yang Miang Goh</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jing Tian</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+V+J+L">Vincent J. L. Gan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17513v1-abstract-short" style="display: inline;"> Workplace safety has received increasing attention as millions of workers worldwide suffer from work-related accidents. Despite poor housekeeping is a significant contributor to construction accidents, there remains a significant lack of technological research focused on improving housekeeping practices in construction sites. Recognizing and locating poor housekeeping in a dynamic construction sit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17513v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17513v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17513v1-abstract-full" style="display: none;"> Workplace safety has received increasing attention as millions of workers worldwide suffer from work-related accidents. Despite poor housekeeping is a significant contributor to construction accidents, there remains a significant lack of technological research focused on improving housekeeping practices in construction sites. Recognizing and locating poor housekeeping in a dynamic construction site is an important task that can be improved through computer vision approaches. Despite advances in AI and computer vision, existing methods for detecting poor housekeeping conditions face many challenges, including limited explanations, lack of locating of poor housekeeping, and lack of annotated datasets. On the other hand, change detection which aims to detect the changed environmental conditions (e.g., changing from good to poor housekeeping) and 'where' the change has occurred (e.g., location of objects causing poor housekeeping), has not been explored to the problem of housekeeping management. To address these challenges, we propose the Housekeeping Change Detection Network (HCDN), an advanced change detection neural network that integrates a feature fusion module and a large vision model, achieving state-of-the-art performance. Additionally, we introduce the approach to establish a novel change detection dataset (named Housekeeping-CCD) focused on housekeeping in construction sites, along with a housekeeping segmentation dataset. Our contributions include significant performance improvements compared to existing methods, providing an effective tool for enhancing construction housekeeping and safety. To promote further development, we share our source code and trained models for global researchers: https://github.com/NUS-DBE/Housekeeping-CD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17513v1-abstract-full').style.display = 'none'; document.getElementById('2410.17513v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14827">arXiv:2410.14827</a> <span> [<a href="https://arxiv.org/pdf/2410.14827">pdf</a>, <a href="https://arxiv.org/format/2410.14827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Making LLMs Vulnerable to Prompt Injection via Poisoning Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zedian Shao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+J">Jaden Mu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N+Z">Neil Zhenqiang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14827v1-abstract-short" style="display: inline;"> In a prompt injection attack, an attacker injects a prompt into the original one, aiming to make the LLM follow the injected prompt and perform a task chosen by the attacker. Existing prompt injection attacks primarily focus on how to blend the injected prompt into the original prompt without altering the LLM itself. Our experiments show that these attacks achieve some success, but there is still… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14827v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14827v1-abstract-full" style="display: none;"> In a prompt injection attack, an attacker injects a prompt into the original one, aiming to make the LLM follow the injected prompt and perform a task chosen by the attacker. Existing prompt injection attacks primarily focus on how to blend the injected prompt into the original prompt without altering the LLM itself. Our experiments show that these attacks achieve some success, but there is still significant room for improvement. In this work, we show that an attacker can boost the success of prompt injection attacks by poisoning the LLM's alignment process. Specifically, we propose PoisonedAlign, a method to strategically create poisoned alignment samples. When even a small fraction of the alignment data is poisoned using our method, the aligned LLM becomes more vulnerable to prompt injection while maintaining its foundational capabilities. The code is available at https://github.com/Sadcardation/PoisonedAlign <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14827v1-abstract-full').style.display = 'none'; document.getElementById('2410.14827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11242">arXiv:2410.11242</a> <span> [<a href="https://arxiv.org/pdf/2410.11242">pdf</a>, <a href="https://arxiv.org/format/2410.11242">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Automatically Generating Visual Hallucination Test Cases for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhongye Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuepeng Hu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zedian Shao</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N+Z">Neil Zhenqiang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11242v1-abstract-short" style="display: inline;"> Visual hallucination (VH) occurs when a multimodal large language model (MLLM) generates responses with incorrect visual details for prompts. Existing methods for generating VH test cases primarily rely on human annotations, typically in the form of triples: (image, question, answer). In this paper, we introduce VHExpansion, the first automated method for expanding VH test cases for MLLMs. Given a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11242v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11242v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11242v1-abstract-full" style="display: none;"> Visual hallucination (VH) occurs when a multimodal large language model (MLLM) generates responses with incorrect visual details for prompts. Existing methods for generating VH test cases primarily rely on human annotations, typically in the form of triples: (image, question, answer). In this paper, we introduce VHExpansion, the first automated method for expanding VH test cases for MLLMs. Given an initial VH test case, VHExpansion automatically expands it by perturbing the question and answer through negation as well as modifying the image using both common and adversarial perturbations. Additionally, we propose a new evaluation metric, symmetric accuracy, which measures the proportion of correctly answered VH test-case pairs. Each pair consists of a test case and its negated counterpart. Our theoretical analysis shows that symmetric accuracy is an unbiased evaluation metric that remains unaffected by the imbalance of VH testing cases with varying answers when an MLLM is randomly guessing the answers, whereas traditional accuracy is prone to such imbalance. We apply VHExpansion to expand three VH datasets annotated manually and use these expanded datasets to benchmark seven MLLMs. Our evaluation shows that VHExpansion effectively identifies more VH test cases. Moreover, symmetric accuracy, being unbiased, leads to different conclusions about the vulnerability of MLLMs to VH compared to traditional accuracy metric. Finally, we show that fine-tuning MLLMs on the expanded VH dataset generated by VHExpansion mitigates VH more effectively than fine-tuning on the original, manually annotated dataset. Our code is available at: https://github.com/lycheeefish/VHExpansion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11242v1-abstract-full').style.display = 'none'; document.getElementById('2410.11242v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10165">arXiv:2410.10165</a> <span> [<a href="https://arxiv.org/pdf/2410.10165">pdf</a>, <a href="https://arxiv.org/format/2410.10165">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HSR-Enhanced Sparse Attention Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10165v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable capabilities across various applications, but their performance on long-context tasks is often limited by the computational complexity of attention mechanisms. This paper introduces a novel approach to accelerate attention computation in LLMs, particularly for long-context scenarios. We leverage the inherent sparsity within attention mechan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10165v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10165v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10165v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable capabilities across various applications, but their performance on long-context tasks is often limited by the computational complexity of attention mechanisms. This paper introduces a novel approach to accelerate attention computation in LLMs, particularly for long-context scenarios. We leverage the inherent sparsity within attention mechanisms, both in conventional Softmax attention and ReLU attention (with $\mathsf{ReLU}^伪$ activation, $伪\in \mathbb{N}_+$), to significantly reduce the running time complexity. Our method employs a Half-Space Reporting (HSR) data structure to rapidly identify non-zero or "massively activated" entries in the attention matrix. We present theoretical analyses for two key scenarios: attention generation and full attention computation with long input context. Our approach achieves a running time of $O(mn^{4/5})$ significantly faster than the naive approach $O(mn)$ for attention generation, where $n$ is the context length, $m$ is the query length, and $d$ is the hidden dimension. We can also reduce the running time of full attention computation from $O(mn)$ to $O(mn^{1 - 1 / \lfloor d/2\rfloor} + mn^{4/5})$. Importantly, our method introduces no error for ReLU attention and only provably negligible error for Softmax attention, where the latter is supported by our empirical validation. This work represents a significant step towards enabling efficient long-context processing in LLMs, potentially broadening their applicability across various domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10165v1-abstract-full').style.display = 'none'; document.getElementById('2410.10165v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09375">arXiv:2410.09375</a> <span> [<a href="https://arxiv.org/pdf/2410.09375">pdf</a>, <a href="https://arxiv.org/ps/2410.09375">ps</a>, <a href="https://arxiv.org/format/2410.09375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Looped ReLU MLPs May Be All You Need as Practical Programmable Computers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yufa Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09375v2-abstract-short" style="display: inline;"> Previous work has demonstrated that attention mechanisms are Turing complete. More recently, it has been shown that a looped 9-layer Transformer can function as a universal programmable computer. In contrast, the multi-layer perceptrons with $\mathsf{ReLU}$ activation ($\mathsf{ReLU}$-$\mathsf{MLP}$), one of the most fundamental components of neural networks, is known to be expressive; specificall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09375v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09375v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09375v2-abstract-full" style="display: none;"> Previous work has demonstrated that attention mechanisms are Turing complete. More recently, it has been shown that a looped 9-layer Transformer can function as a universal programmable computer. In contrast, the multi-layer perceptrons with $\mathsf{ReLU}$ activation ($\mathsf{ReLU}$-$\mathsf{MLP}$), one of the most fundamental components of neural networks, is known to be expressive; specifically, a two-layer neural network is a universal approximator given an exponentially large number of hidden neurons. However, it remains unclear whether a $\mathsf{ReLU}$-$\mathsf{MLP}$ can be made into a universal programmable computer using a practical number of weights. In this work, we provide an affirmative answer that a looped 23-layer $\mathsf{ReLU}$-$\mathsf{MLP}$ is capable of performing the basic necessary operations, more efficiently and effectively functioning as a programmable computer than a looped Transformer. This indicates simple modules have stronger expressive power than previously expected and have not been fully explored. Our work provides insights into the mechanisms of neural networks and demonstrates that complex tasks, such as functioning as a programmable computer, do not necessarily require advanced architectures like Transformers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09375v2-abstract-full').style.display = 'none'; document.getElementById('2410.09375v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AIStats 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03268">arXiv:2410.03268</a> <span> [<a href="https://arxiv.org/pdf/2410.03268">pdf</a>, <a href="https://arxiv.org/format/2410.03268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Narrative Player: Reviving Data Narratives with Visuals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zekai Shao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Leixian Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haotian Li</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Yi Shan</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+H">Huamin Qu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yun Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03268v2-abstract-short" style="display: inline;"> Data-rich documents are commonly found across various fields such as business, finance, and science. However, a general limitation of these documents for reading is their reliance on text to convey data and facts. Visual representation of text aids in providing a satisfactory reading experience in comprehension and engagement. However, existing work emphasizes presenting the insights of local text… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03268v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03268v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03268v2-abstract-full" style="display: none;"> Data-rich documents are commonly found across various fields such as business, finance, and science. However, a general limitation of these documents for reading is their reliance on text to convey data and facts. Visual representation of text aids in providing a satisfactory reading experience in comprehension and engagement. However, existing work emphasizes presenting the insights of local text context, rather than fully conveying data stories within the whole paragraphs and engaging readers. To provide readers with satisfactory data stories, this paper presents Narrative Player, a novel method that automatically revives data narratives with consistent and contextualized visuals. Specifically, it accepts a paragraph and corresponding data table as input and leverages LLMs to characterize the clauses and extract contextualized data facts. Subsequently, the facts are transformed into a coherent visualization sequence with a carefully designed optimization-based approach. Animations are also assigned between adjacent visualizations to enable seamless transitions. Finally, the visualization sequence, transition animations, and audio narration generated by text-to-speech technologies are rendered into a data video. The evaluation results showed that the automatic-generated data videos were well-received by participants and experts for enhancing reading. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03268v2-abstract-full').style.display = 'none'; document.getElementById('2410.03268v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TVCG</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01251">arXiv:2410.01251</a> <span> [<a href="https://arxiv.org/pdf/2410.01251">pdf</a>, <a href="https://arxiv.org/format/2410.01251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11263-024-02258-6">10.1007/s11263-024-02258-6 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Facial Action Unit Detection by Adaptively Constraining Self-Attention and Causally Deconfounding Sample </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiwen Shao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hancheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yong Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+X">Xiang Xiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bing Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+R">Rui Yao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lizhuang Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01251v1-abstract-short" style="display: inline;"> Facial action unit (AU) detection remains a challenging task, due to the subtlety, dynamics, and diversity of AUs. Recently, the prevailing techniques of self-attention and causal inference have been introduced to AU detection. However, most existing methods directly learn self-attention guided by AU detection, or employ common patterns for all AUs during causal intervention. The former often capt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01251v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01251v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01251v1-abstract-full" style="display: none;"> Facial action unit (AU) detection remains a challenging task, due to the subtlety, dynamics, and diversity of AUs. Recently, the prevailing techniques of self-attention and causal inference have been introduced to AU detection. However, most existing methods directly learn self-attention guided by AU detection, or employ common patterns for all AUs during causal intervention. The former often captures irrelevant information in a global range, and the latter ignores the specific causal characteristic of each AU. In this paper, we propose a novel AU detection framework called AC2D by adaptively constraining self-attention weight distribution and causally deconfounding the sample confounder. Specifically, we explore the mechanism of self-attention weight distribution, in which the self-attention weight distribution of each AU is regarded as spatial distribution and is adaptively learned under the constraint of location-predefined attention and the guidance of AU detection. Moreover, we propose a causal intervention module for each AU, in which the bias caused by training samples and the interference from irrelevant AUs are both suppressed. Extensive experiments show that our method achieves competitive performance compared to state-of-the-art AU detection approaches on challenging benchmarks, including BP4D, DISFA, GFT, and BP4D+ in constrained scenarios and Aff-Wild2 in unconstrained scenarios. The code is available at https://github.com/ZhiwenShao/AC2D. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01251v1-abstract-full').style.display = 'none'; document.getElementById('2410.01251v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by International Journal of Computer Vision</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05921">arXiv:2409.05921</a> <span> [<a href="https://arxiv.org/pdf/2409.05921">pdf</a>, <a href="https://arxiv.org/format/2409.05921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> STLLM-DF: A Spatial-Temporal Large Language Model with Diffusion for Enhanced Multi-Mode Traffic System Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiqi Shao</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haoning Xi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haohui Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ze Wang</a>, <a href="/search/cs?searchtype=author&query=Bell%2C+M+G+H">Michael G. H. Bell</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junbin Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05921v1-abstract-short" style="display: inline;"> The rapid advancement of Intelligent Transportation Systems (ITS) presents challenges, particularly with missing data in multi-modal transportation and the complexity of handling diverse sequential tasks within a centralized framework. To address these issues, we propose the Spatial-Temporal Large Language Model Diffusion (STLLM-DF), an innovative model that leverages Denoising Diffusion Probabili… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05921v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05921v1-abstract-full" style="display: none;"> The rapid advancement of Intelligent Transportation Systems (ITS) presents challenges, particularly with missing data in multi-modal transportation and the complexity of handling diverse sequential tasks within a centralized framework. To address these issues, we propose the Spatial-Temporal Large Language Model Diffusion (STLLM-DF), an innovative model that leverages Denoising Diffusion Probabilistic Models (DDPMs) and Large Language Models (LLMs) to improve multi-task transportation prediction. The DDPM's robust denoising capabilities enable it to recover underlying data patterns from noisy inputs, making it particularly effective in complex transportation systems. Meanwhile, the non-pretrained LLM dynamically adapts to spatial-temporal relationships within multi-modal networks, allowing the system to efficiently manage diverse transportation tasks in both long-term and short-term predictions. Extensive experiments demonstrate that STLLM-DF consistently outperforms existing models, achieving an average reduction of 2.40\% in MAE, 4.50\% in RMSE, and 1.51\% in MAPE. This model significantly advances centralized ITS by enhancing predictive accuracy, robustness, and overall system performance across multiple tasks, thus paving the way for more effective spatio-temporal traffic forecasting through the integration of frozen transformer language models and diffusion techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05921v1-abstract-full').style.display = 'none'; document.getElementById('2409.05921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 11 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> I.2.7 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05099">arXiv:2409.05099</a> <span> [<a href="https://arxiv.org/pdf/2409.05099">pdf</a>, <a href="https://arxiv.org/format/2409.05099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DreamMapping: High-Fidelity Text-to-3D Generation via Variational Distribution Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zeyu Cai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Duotun Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhijing Shao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+X">Xiaohang Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05099v4-abstract-short" style="display: inline;"> Score Distillation Sampling (SDS) has emerged as a prevalent technique for text-to-3D generation, enabling 3D content creation by distilling view-dependent information from text-to-2D guidance. However, they frequently exhibit shortcomings such as over-saturated color and excess smoothness. In this paper, we conduct a thorough analysis of SDS and refine its formulation, finding that the core desig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05099v4-abstract-full').style.display = 'inline'; document.getElementById('2409.05099v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05099v4-abstract-full" style="display: none;"> Score Distillation Sampling (SDS) has emerged as a prevalent technique for text-to-3D generation, enabling 3D content creation by distilling view-dependent information from text-to-2D guidance. However, they frequently exhibit shortcomings such as over-saturated color and excess smoothness. In this paper, we conduct a thorough analysis of SDS and refine its formulation, finding that the core design is to model the distribution of rendered images. Following this insight, we introduce a novel strategy called Variational Distribution Mapping (VDM), which expedites the distribution modeling process by regarding the rendered images as instances of degradation from diffusion-based generation. This special design enables the efficient training of variational distribution by skipping the calculations of the Jacobians in the diffusion U-Net. We also introduce timestep-dependent Distribution Coefficient Annealing (DCA) to further improve distilling precision. Leveraging VDM and DCA, we use Gaussian Splatting as the 3D representation and build a text-to-3D generation framework. Extensive experiments and evaluations demonstrate the capability of VDM and DCA to generate high-fidelity and realistic assets with optimization efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05099v4-abstract-full').style.display = 'none'; document.getElementById('2409.05099v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 14 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.9; I.3.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14158">arXiv:2408.14158</a> <span> [<a href="https://arxiv.org/pdf/2408.14158">pdf</a>, <a href="https://arxiv.org/format/2408.14158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fire-Flyer AI-HPC: A Cost-Effective Software-Hardware Co-Design for Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+W">Wei An</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shanhuang Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Honghui Ding</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kai Dong</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Q">Qiushi Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Wenjun Gao</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+K">Kang Guan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jianzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yongqiang Guo</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zhe Fu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Panpan Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiashi Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wenfeng Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuxuan Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shanghao Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xuan Lu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+X">Xiaotao Nie</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+T">Tian Pei</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14158v2-abstract-short" style="display: inline;"> The rapid progress in Deep Learning (DL) and Large Language Models (LLMs) has exponentially increased demands of computational power and bandwidth. This, combined with the high costs of faster computing chips and interconnects, has significantly inflated High Performance Computing (HPC) construction costs. To address these challenges, we introduce the Fire-Flyer AI-HPC architecture, a synergistic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14158v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14158v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14158v2-abstract-full" style="display: none;"> The rapid progress in Deep Learning (DL) and Large Language Models (LLMs) has exponentially increased demands of computational power and bandwidth. This, combined with the high costs of faster computing chips and interconnects, has significantly inflated High Performance Computing (HPC) construction costs. To address these challenges, we introduce the Fire-Flyer AI-HPC architecture, a synergistic hardware-software co-design framework and its best practices. For DL training, we deployed the Fire-Flyer 2 with 10,000 PCIe A100 GPUs, achieved performance approximating the DGX-A100 while reducing costs by half and energy consumption by 40%. We specifically engineered HFReduce to accelerate allreduce communication and implemented numerous measures to keep our Computation-Storage Integrated Network congestion-free. Through our software stack, including HaiScale, 3FS, and HAI-Platform, we achieved substantial scalability by overlapping computation and communication. Our system-oriented experience from DL training provides valuable insights to drive future advancements in AI-HPC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14158v2-abstract-full').style.display = 'none'; document.getElementById('2408.14158v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is the preprint version of the paper accepted for presentation at the 2024 International Conference for High Performance Computing, Networking, Storage, and Analysis (SC'24). \c{opyright} 2024 IEEE. Personal use of this material is permitted. For other uses, permission from IEEE must be obtained. Please refer to IEEE Xplore for the final published version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13233">arXiv:2408.13233</a> <span> [<a href="https://arxiv.org/pdf/2408.13233">pdf</a>, <a href="https://arxiv.org/ps/2408.13233">ps</a>, <a href="https://arxiv.org/format/2408.13233">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Layer Transformers Gradient Can be Approximated in Almost Linear Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yufa Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13233v2-abstract-short" style="display: inline;"> The computational complexity of the self-attention mechanism in popular transformer architectures poses significant challenges for training and inference, and becomes the bottleneck for long inputs. Is it possible to significantly reduce the quadratic time complexity of computing the gradients in multi-layer transformer models? This paper proves that a novel fast approximation method can calculate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13233v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13233v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13233v2-abstract-full" style="display: none;"> The computational complexity of the self-attention mechanism in popular transformer architectures poses significant challenges for training and inference, and becomes the bottleneck for long inputs. Is it possible to significantly reduce the quadratic time complexity of computing the gradients in multi-layer transformer models? This paper proves that a novel fast approximation method can calculate the gradients in almost linear time $n^{1+o(1)}$ where $n$ is the input sequence length, while it maintains a polynomially small approximation error $1 / \mathrm{poly}(n)$ across the entire model. Our theory holds for general loss functions and when the multi-layer transformer model contains many practical sub-modules, such as residual connection, casual mask, and multi-head attention. By improving the efficiency of gradient computation, we hope that this work will facilitate more effective training and deployment of long-context language models based on our theoretical results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13233v2-abstract-full').style.display = 'none'; document.getElementById('2408.13233v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10588">arXiv:2408.10588</a> <span> [<a href="https://arxiv.org/pdf/2408.10588">pdf</a>, <a href="https://arxiv.org/format/2408.10588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DEGAS: Detailed Expressions on Full-Body Gaussian Avatars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhijing Shao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Duotun Wang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qing-Yao Tian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yao-Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+H">Hengyu Meng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zeyu Cai</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bo Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10588v2-abstract-short" style="display: inline;"> Although neural rendering has made significant advances in creating lifelike, animatable full-body and head avatars, incorporating detailed expressions into full-body avatars remains largely unexplored. We present DEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for full-body avatars with rich facial expressions. Trained on multiview videos of a given subject, our method learns… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10588v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10588v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10588v2-abstract-full" style="display: none;"> Although neural rendering has made significant advances in creating lifelike, animatable full-body and head avatars, incorporating detailed expressions into full-body avatars remains largely unexplored. We present DEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for full-body avatars with rich facial expressions. Trained on multiview videos of a given subject, our method learns a conditional variational autoencoder that takes both the body motion and facial expression as driving signals to generate Gaussian maps in the UV layout. To drive the facial expressions, instead of the commonly used 3D Morphable Models (3DMMs) in 3D head avatars, we propose to adopt the expression latent space trained solely on 2D portrait images, bridging the gap between 2D talking faces and 3D avatars. Leveraging the rendering capability of 3DGS and the rich expressiveness of the expression latent space, the learned avatars can be reenacted to reproduce photorealistic rendering images with subtle and accurate facial expressions. Experiments on an existing dataset and our newly proposed dataset of full-body talking avatars demonstrate the efficacy of our method. We also propose an audio-driven extension of our method with the help of 2D talking faces, opening new possibilities for interactive AI agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10588v2-abstract-full').style.display = 'none'; document.getElementById('2408.10588v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09695">arXiv:2408.09695</a> <span> [<a href="https://arxiv.org/pdf/2408.09695">pdf</a>, <a href="https://arxiv.org/format/2408.09695">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> LightWeather: Harnessing Absolute Positional Encoding to Efficient and Scalable Global Weather Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yisong Fu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zezhi Shao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chengqing Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yujie Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhao Chen</a>, <a href="/search/cs?searchtype=author&query=An%2C+Z">Zhulin An</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yongjun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09695v1-abstract-short" style="display: inline;"> Recently, Transformers have gained traction in weather forecasting for their capability to capture long-term spatial-temporal correlations. However, their complex architectures result in large parameter counts and extended training times, limiting their practical application and scalability to global-scale forecasting. This paper aims to explore the key factor for accurate weather forecasting and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09695v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09695v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09695v1-abstract-full" style="display: none;"> Recently, Transformers have gained traction in weather forecasting for their capability to capture long-term spatial-temporal correlations. However, their complex architectures result in large parameter counts and extended training times, limiting their practical application and scalability to global-scale forecasting. This paper aims to explore the key factor for accurate weather forecasting and design more efficient solutions. Interestingly, our empirical findings reveal that absolute positional encoding is what really works in Transformer-based weather forecasting models, which can explicitly model the spatial-temporal correlations even without attention mechanisms. We theoretically prove that its effectiveness stems from the integration of geographical coordinates and real-world time features, which are intrinsically related to the dynamics of weather. Based on this, we propose LightWeather, a lightweight and effective model for station-based global weather forecasting. We employ absolute positional encoding and a simple MLP in place of other components of Transformer. With under 30k parameters and less than one hour of training time, LightWeather achieves state-of-the-art performance on global weather datasets compared to other advanced DL methods. The results underscore the superiority of integrating spatial-temporal knowledge over complex architectures, providing novel insights for DL in weather forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09695v1-abstract-full').style.display = 'none'; document.getElementById('2408.09695v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08152">arXiv:2408.08152</a> <span> [<a href="https://arxiv.org/pdf/2408.08152">pdf</a>, <a href="https://arxiv.org/format/2408.08152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for Reinforcement Learning and Monte-Carlo Tree Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+H">Huajian Xin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Z+Z">Z. Z. Ren</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wanjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haocheng Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xuan Lu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Q">Qiushi Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Wenjun Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+C">Chong Ruan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08152v1-abstract-short" style="display: inline;"> We introduce DeepSeek-Prover-V1.5, an open-source language model designed for theorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both training and inference processes. Pre-trained on DeepSeekMath-Base with specialization in formal mathematical languages, the model undergoes supervised fine-tuning using an enhanced formal theorem proving dataset derived from DeepSeek-Prover-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08152v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08152v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08152v1-abstract-full" style="display: none;"> We introduce DeepSeek-Prover-V1.5, an open-source language model designed for theorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both training and inference processes. Pre-trained on DeepSeekMath-Base with specialization in formal mathematical languages, the model undergoes supervised fine-tuning using an enhanced formal theorem proving dataset derived from DeepSeek-Prover-V1. Further refinement is achieved through reinforcement learning from proof assistant feedback (RLPAF). Beyond the single-pass whole-proof generation approach of DeepSeek-Prover-V1, we propose RMaxTS, a variant of Monte-Carlo tree search that employs an intrinsic-reward-driven exploration strategy to generate diverse proof paths. DeepSeek-Prover-V1.5 demonstrates significant improvements over DeepSeek-Prover-V1, achieving new state-of-the-art results on the test set of the high school level miniF2F benchmark ($63.5\%$) and the undergraduate level ProofNet benchmark ($25.3\%$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08152v1-abstract-full').style.display = 'none'; document.getElementById('2408.08152v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06304">arXiv:2408.06304</a> <span> [<a href="https://arxiv.org/pdf/2408.06304">pdf</a>, <a href="https://arxiv.org/format/2408.06304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Control-Flow Attestation: Concepts, Solutions, and Open Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhanyu Sha</a>, <a href="/search/cs?searchtype=author&query=Shepherd%2C+C">Carlton Shepherd</a>, <a href="/search/cs?searchtype=author&query=Rafi%2C+A">Amir Rafi</a>, <a href="/search/cs?searchtype=author&query=Markantonakis%2C+K">Konstantinos Markantonakis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06304v4-abstract-short" style="display: inline;"> Control-flow attestation unifies the worlds of control-flow integrity and platform attestation by measuring and reporting a target's run-time behaviour to a verifier. Trust assurances in the target are provided by testing whether its execution follows an authorised control-flow path. The problem has been explored in various settings, such as assessing the trustworthiness of cloud platforms, cyber-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06304v4-abstract-full').style.display = 'inline'; document.getElementById('2408.06304v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06304v4-abstract-full" style="display: none;"> Control-flow attestation unifies the worlds of control-flow integrity and platform attestation by measuring and reporting a target's run-time behaviour to a verifier. Trust assurances in the target are provided by testing whether its execution follows an authorised control-flow path. The problem has been explored in various settings, such as assessing the trustworthiness of cloud platforms, cyber-physical systems, and Internet of Things devices. Despite a significant number of proposals being made in recent years, the area remains fragmented, with different adversarial behaviours, verification paradigms, and deployment challenges being addressed. In this paper, we present the first survey of control-flow attestation, examining the core ideas and solutions in state-of-the-art schemes. In total, we survey over 30 papers published between 2016--2024, consolidate and compare their key features, and pose several challenges and recommendations for future research in the area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06304v4-abstract-full').style.display = 'none'; document.getElementById('2408.06304v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20570">arXiv:2407.20570</a> <span> [<a href="https://arxiv.org/pdf/2407.20570">pdf</a>, <a href="https://arxiv.org/format/2407.20570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Fine-Tuned Large Language Model for Visualization System: A Study on Self-Regulated Learning in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+L">Lin Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jing Lu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zekai Shao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Ziyue Lin</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+S">Shengbin Yue</a>, <a href="/search/cs?searchtype=author&query=Ieong%2C+C">Chiokit Ieong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yi Sun</a>, <a href="/search/cs?searchtype=author&query=Zauner%2C+R+J">Rory James Zauner</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20570v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown great potential in intelligent visualization systems, especially for domain-specific applications. Integrating LLMs into visualization systems presents challenges, and we categorize these challenges into three alignments: domain problems with LLMs, visualization with LLMs, and interaction with LLMs. To achieve these alignments, we propose a framework and out… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20570v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20570v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown great potential in intelligent visualization systems, especially for domain-specific applications. Integrating LLMs into visualization systems presents challenges, and we categorize these challenges into three alignments: domain problems with LLMs, visualization with LLMs, and interaction with LLMs. To achieve these alignments, we propose a framework and outline a workflow to guide the application of fine-tuned LLMs to enhance visual interactions for domain-specific tasks. These alignment challenges are critical in education because of the need for an intelligent visualization system to support beginners' self-regulated learning. Therefore, we apply the framework to education and introduce Tailor-Mind, an interactive visualization system designed to facilitate self-regulated learning for artificial intelligence beginners. Drawing on insights from a preliminary study, we identify self-regulated learning tasks and fine-tuning objectives to guide visualization design and tuning data construction. Our focus on aligning visualization with fine-tuned LLM makes Tailor-Mind more like a personalized tutor. Tailor-Mind also supports interactive recommendations to help beginners better achieve their learning goals. Model performance evaluations and user studies confirm that Tailor-Mind improves the self-regulated learning experience, effectively validating the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20570v1-abstract-full').style.display = 'none'; document.getElementById('2407.20570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15502">arXiv:2407.15502</a> <span> [<a href="https://arxiv.org/pdf/2407.15502">pdf</a>, <a href="https://arxiv.org/format/2407.15502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WebRPG: Automatic Web Rendering Parameters Generation for Visual Presentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zirui Shao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Feiyu Gao</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+H">Hangdi Xing</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zepeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhi Yu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiajun Bu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qi Zheng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+C">Cong Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15502v1-abstract-short" style="display: inline;"> In the era of content creation revolution propelled by advancements in generative models, the field of web design remains unexplored despite its critical role in modern digital communication. The web design process is complex and often time-consuming, especially for those with limited expertise. In this paper, we introduce Web Rendering Parameters Generation (WebRPG), a new task that aims at autom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15502v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15502v1-abstract-full" style="display: none;"> In the era of content creation revolution propelled by advancements in generative models, the field of web design remains unexplored despite its critical role in modern digital communication. The web design process is complex and often time-consuming, especially for those with limited expertise. In this paper, we introduce Web Rendering Parameters Generation (WebRPG), a new task that aims at automating the generation for visual presentation of web pages based on their HTML code. WebRPG would contribute to a faster web development workflow. Since there is no existing benchmark available, we develop a new dataset for WebRPG through an automated pipeline. Moreover, we present baseline models, utilizing VAE to manage numerous elements and rendering parameters, along with custom HTML embedding for capturing essential semantic and hierarchical information from HTML. Extensive experiments, including customized quantitative evaluations for this specific task, are conducted to evaluate the quality of the generated results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15502v1-abstract-full').style.display = 'none'; document.getElementById('2407.15502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ECCV 2024. The dataset and code can be accessed at https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/WebRPG</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13621">arXiv:2407.13621</a> <span> [<a href="https://arxiv.org/pdf/2407.13621">pdf</a>, <a href="https://arxiv.org/format/2407.13621">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Differential Privacy Mechanisms in Neural Tangent Kernel Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13621v2-abstract-short" style="display: inline;"> Training data privacy is a fundamental problem in modern Artificial Intelligence (AI) applications, such as face recognition, recommendation systems, language generation, and many others, as it may contain sensitive user information related to legal issues. To fundamentally understand how privacy mechanisms work in AI applications, we study differential privacy (DP) in the Neural Tangent Kernel (N… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13621v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13621v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13621v2-abstract-full" style="display: none;"> Training data privacy is a fundamental problem in modern Artificial Intelligence (AI) applications, such as face recognition, recommendation systems, language generation, and many others, as it may contain sensitive user information related to legal issues. To fundamentally understand how privacy mechanisms work in AI applications, we study differential privacy (DP) in the Neural Tangent Kernel (NTK) regression setting, where DP is one of the most powerful tools for measuring privacy under statistical learning, and NTK is one of the most popular analysis frameworks for studying the learning mechanisms of deep neural networks. In our work, we can show provable guarantees for both differential privacy and test accuracy of our NTK regression. Furthermore, we conduct experiments on the basic image classification dataset CIFAR10 to demonstrate that NTK regression can preserve good accuracy under a modest privacy budget, supporting the validity of our analysis. To our knowledge, this is the first work to provide a DP guarantee for NTK regression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13621v2-abstract-full').style.display = 'none'; document.getElementById('2407.13621v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10430">arXiv:2407.10430</a> <span> [<a href="https://arxiv.org/pdf/2407.10430">pdf</a>, <a href="https://arxiv.org/format/2407.10430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Expanding the Scope: Inductive Knowledge Graph Reasoning with Multi-Starting Progressive Propagation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhoutian Shao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanning Cui</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wei Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10430v1-abstract-short" style="display: inline;"> Knowledge graphs (KGs) are widely acknowledged as incomplete, and new entities are constantly emerging in the real world. Inductive KG reasoning aims to predict missing facts for these new entities. Among existing models, graph neural networks (GNNs) based ones have shown promising performance for this task. However, they are still challenged by inefficient message propagation due to the distance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10430v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10430v1-abstract-full" style="display: none;"> Knowledge graphs (KGs) are widely acknowledged as incomplete, and new entities are constantly emerging in the real world. Inductive KG reasoning aims to predict missing facts for these new entities. Among existing models, graph neural networks (GNNs) based ones have shown promising performance for this task. However, they are still challenged by inefficient message propagation due to the distance and scalability issues. In this paper, we propose a new inductive KG reasoning model, MStar, by leveraging conditional message passing neural networks (C-MPNNs). Our key insight is to select multiple query-specific starting entities to expand the scope of progressive propagation. To propagate query-related messages to a farther area within limited steps, we subsequently design a highway layer to propagate information toward these selected starting entities. Moreover, we introduce a training strategy called LinkVerify to mitigate the impact of noisy training samples. Experimental results validate that MStar achieves superior performance compared with state-of-the-art models, especially for distant entities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10430v1-abstract-full').style.display = 'none'; document.getElementById('2407.10430v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in the 23rd International Semantic Web Conference (ISWC 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09050">arXiv:2407.09050</a> <span> [<a href="https://arxiv.org/pdf/2407.09050">pdf</a>, <a href="https://arxiv.org/format/2407.09050">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Refusing Safe Prompts for Multi-modal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zedian Shao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuepeng Hu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N+Z">Neil Zhenqiang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09050v2-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have become the cornerstone of today's generative AI ecosystem, sparking intense competition among tech giants and startups. In particular, an MLLM generates a text response given a prompt consisting of an image and a question. While state-of-the-art MLLMs use safety filters and alignment techniques to refuse unsafe prompts, in this work, we introduce MLLM-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09050v2-abstract-full').style.display = 'inline'; document.getElementById('2407.09050v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09050v2-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have become the cornerstone of today's generative AI ecosystem, sparking intense competition among tech giants and startups. In particular, an MLLM generates a text response given a prompt consisting of an image and a question. While state-of-the-art MLLMs use safety filters and alignment techniques to refuse unsafe prompts, in this work, we introduce MLLM-Refusal, the first method that induces refusals for safe prompts. In particular, our MLLM-Refusal optimizes a nearly-imperceptible refusal perturbation and adds it to an image, causing target MLLMs to likely refuse a safe prompt containing the perturbed image and a safe question. Specifically, we formulate MLLM-Refusal as a constrained optimization problem and propose an algorithm to solve it. Our method offers competitive advantages for MLLM model providers by potentially disrupting user experiences of competing MLLMs, since competing MLLM's users will receive unexpected refusals when they unwittingly use these perturbed images in their prompts. We evaluate MLLM-Refusal on four MLLMs across four datasets, demonstrating its effectiveness in causing competing MLLMs to refuse safe prompts while not affecting non-competing MLLMs. Furthermore, we explore three potential countermeasures-adding Gaussian noise, DiffPure, and adversarial training. Our results show that though they can mitigate MLLM-Refusal's effectiveness, they also sacrifice the accuracy and/or efficiency of the competing MLLM. The code is available at https://github.com/Sadcardation/MLLM-Refusal. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09050v2-abstract-full').style.display = 'none'; document.getElementById('2407.09050v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19217">arXiv:2406.19217</a> <span> [<a href="https://arxiv.org/pdf/2406.19217">pdf</a>, <a href="https://arxiv.org/format/2406.19217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LRA.2024.3495452">10.1109/LRA.2024.3495452 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Think Step by Step: Chain-of-Gesture Prompting for Error Detection in Robotic Surgical Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhimin Shao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jialang Xu</a>, <a href="/search/cs?searchtype=author&query=Stoyanov%2C+D">Danail Stoyanov</a>, <a href="/search/cs?searchtype=author&query=Mazomenos%2C+E+B">Evangelos B. Mazomenos</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yueming Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19217v1-abstract-short" style="display: inline;"> Despite significant advancements in robotic systems and surgical data science, ensuring safe and optimal execution in robot-assisted minimally invasive surgery (RMIS) remains a complex challenge. Current surgical error detection methods involve two parts: identifying surgical gestures and then detecting errors within each gesture clip. These methods seldom consider the rich contextual and semantic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19217v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19217v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19217v1-abstract-full" style="display: none;"> Despite significant advancements in robotic systems and surgical data science, ensuring safe and optimal execution in robot-assisted minimally invasive surgery (RMIS) remains a complex challenge. Current surgical error detection methods involve two parts: identifying surgical gestures and then detecting errors within each gesture clip. These methods seldom consider the rich contextual and semantic information inherent in surgical videos, limiting their performance due to reliance on accurate gesture identification. Motivated by the chain-of-thought prompting in natural language processing, this letter presents a novel and real-time end-to-end error detection framework, Chain-of-Thought (COG) prompting, leveraging contextual information from surgical videos. This encompasses two reasoning modules designed to mimic the decision-making processes of expert surgeons. Concretely, we first design a Gestural-Visual Reasoning module, which utilizes transformer and attention architectures for gesture prompting, while the second, a Multi-Scale Temporal Reasoning module, employs a multi-stage temporal convolutional network with both slow and fast paths for temporal information extraction. We extensively validate our method on the public benchmark RMIS dataset JIGSAWS. Our method encapsulates the reasoning processes inherent to surgical activities enabling it to outperform the state-of-the-art by 4.6% in F1 score, 4.6% in Accuracy, and 5.9% in Jaccard index while processing each frame in 6.69 milliseconds on average, demonstrating the great potential of our approach in enhancing the safety and efficacy of RMIS procedures and surgical education. The code will be available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19217v1-abstract-full').style.display = 'none'; document.getElementById('2406.19217v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Robotics and Automation Letters, vol. 9, no. 12, pp. 11513-11520, Dec. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18001">arXiv:2406.18001</a> <span> [<a href="https://arxiv.org/pdf/2406.18001">pdf</a>, <a href="https://arxiv.org/format/2406.18001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Scalable Dual Coordinate Descent for Kernel Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zishan Shao</a>, <a href="/search/cs?searchtype=author&query=Devarakonda%2C+A">Aditya Devarakonda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18001v1-abstract-short" style="display: inline;"> Dual Coordinate Descent (DCD) and Block Dual Coordinate Descent (BDCD) are important iterative methods for solving convex optimization problems. In this work, we develop scalable DCD and BDCD methods for the kernel support vector machines (K-SVM) and kernel ridge regression (K-RR) problems. On distributed-memory parallel machines the scalability of these methods is limited by the need to communica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18001v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18001v1-abstract-full" style="display: none;"> Dual Coordinate Descent (DCD) and Block Dual Coordinate Descent (BDCD) are important iterative methods for solving convex optimization problems. In this work, we develop scalable DCD and BDCD methods for the kernel support vector machines (K-SVM) and kernel ridge regression (K-RR) problems. On distributed-memory parallel machines the scalability of these methods is limited by the need to communicate every iteration. On modern hardware where communication is orders of magnitude more expensive, the running time of the DCD and BDCD methods is dominated by communication cost. We address this communication bottleneck by deriving $s$-step variants of DCD and BDCD for solving the K-SVM and K-RR problems, respectively. The $s$-step variants reduce the frequency of communication by a tunable factor of $s$ at the expense of additional bandwidth and computation. The $s$-step variants compute the same solution as the existing methods in exact arithmetic. We perform numerical experiments to illustrate that the $s$-step variants are also numerically stable in finite-arithmetic, even for large values of $s$. We perform theoretical analysis to bound the computation and communication costs of the newly designed variants, up to leading order. Finally, we develop high performance implementations written in C and MPI and present scaling experiments performed on a Cray EX cluster. The new $s$-step variants achieved strong scaling speedups of up to $9.8\times$ over existing methods using up to $512$ cores. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18001v1-abstract-full').style.display = 'none'; document.getElementById('2406.18001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 65Y05 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.1.3; G.4; F.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16675">arXiv:2406.16675</a> <span> [<a href="https://arxiv.org/pdf/2406.16675">pdf</a>, <a href="https://arxiv.org/ps/2406.16675">ps</a>, <a href="https://arxiv.org/format/2406.16675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Decentralized and Centralized IDD Schemes for Cell-Free Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ssettumba%2C+T">T. Ssettumba</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Z. Shao</a>, <a href="/search/cs?searchtype=author&query=Landau%2C+L">L. Landau</a>, <a href="/search/cs?searchtype=author&query=de+Lamare%2C+R">R. de Lamare</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16675v1-abstract-short" style="display: inline;"> In this paper, we propose iterative interference cancellation schemes with access points selection (APs-Sel) for cell-free massive multiple-input multiple-output (CF-mMIMO) systems. Closed-form expressions for centralized and decentralized linear minimum mean square error (LMMSE) receive filters with APs-Sel are derived assuming imperfect channel state information (CSI). Furthermore, we develop a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16675v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16675v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16675v1-abstract-full" style="display: none;"> In this paper, we propose iterative interference cancellation schemes with access points selection (APs-Sel) for cell-free massive multiple-input multiple-output (CF-mMIMO) systems. Closed-form expressions for centralized and decentralized linear minimum mean square error (LMMSE) receive filters with APs-Sel are derived assuming imperfect channel state information (CSI). Furthermore, we develop a list-based detector based on LMMSE receive filters that exploits interference cancellation and the constellation points. A message-passing-based iterative detection and decoding (IDD) scheme that employs low-density parity-check (LDPC) codes is then developed. Moreover, log-likelihood ratio (LLR) refinement strategies based on censoring and a linear combination of local LLRs are proposed to improve the network performance. We compare the cases with centralized and decentralized processing in terms of bit error rate (BER) performance, complexity, and signaling under perfect CSI (PCSI) and imperfect CSI (ICSI) and verify the superiority of the distributed architecture with LLR refinements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16675v1-abstract-full').style.display = 'none'; document.getElementById('2406.16675v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16006">arXiv:2406.16006</a> <span> [<a href="https://arxiv.org/pdf/2406.16006">pdf</a>, <a href="https://arxiv.org/format/2406.16006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bounding-Box Inference for Error-Aware Model-Based Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Talvitie%2C+E+J">Erin J. Talvitie</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zilei Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huiying Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinghan Hu</a>, <a href="/search/cs?searchtype=author&query=Boerma%2C+J">Jacob Boerma</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Rory Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16006v1-abstract-short" style="display: inline;"> In model-based reinforcement learning, simulated experiences from the learned model are often treated as equivalent to experience from the real environment. However, when the model is inaccurate, it can catastrophically interfere with policy learning. Alternatively, the agent might learn about the model's accuracy and selectively use it only when it can provide reliable predictions. We empirically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16006v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16006v1-abstract-full" style="display: none;"> In model-based reinforcement learning, simulated experiences from the learned model are often treated as equivalent to experience from the real environment. However, when the model is inaccurate, it can catastrophically interfere with policy learning. Alternatively, the agent might learn about the model's accuracy and selectively use it only when it can provide reliable predictions. We empirically explore model uncertainty measures for selective planning and show that best results require distribution insensitive inference to estimate the uncertainty over model-based updates. To that end, we propose and evaluate bounding-box inference, which operates on bounding-boxes around sets of possible states and other quantities. We find that bounding-box inference can reliably support effective selective planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16006v1-abstract-full').style.display = 'none'; document.getElementById('2406.16006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear: Reinforcement Learning Conference (RLC), 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12847">arXiv:2406.12847</a> <span> [<a href="https://arxiv.org/pdf/2406.12847">pdf</a>, <a href="https://arxiv.org/format/2406.12847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ChangeViT: Unleashing Plain Vision Transformers for Change Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Duowang Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaohu Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haiyan Huang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhenfeng Shao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Q">Qimin Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12847v1-abstract-short" style="display: inline;"> Change detection in remote sensing images is essential for tracking environmental changes on the Earth's surface. Despite the success of vision transformers (ViTs) as backbones in numerous computer vision applications, they remain underutilized in change detection, where convolutional neural networks (CNNs) continue to dominate due to their powerful feature extraction capabilities. In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12847v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12847v1-abstract-full" style="display: none;"> Change detection in remote sensing images is essential for tracking environmental changes on the Earth's surface. Despite the success of vision transformers (ViTs) as backbones in numerous computer vision applications, they remain underutilized in change detection, where convolutional neural networks (CNNs) continue to dominate due to their powerful feature extraction capabilities. In this paper, our study uncovers ViTs' unique advantage in discerning large-scale changes, a capability where CNNs fall short. Capitalizing on this insight, we introduce ChangeViT, a framework that adopts a plain ViT backbone to enhance the performance of large-scale changes. This framework is supplemented by a detail-capture module that generates detailed spatial features and a feature injector that efficiently integrates fine-grained spatial information into high-level semantic learning. The feature integration ensures that ChangeViT excels in both detecting large-scale changes and capturing fine-grained details, providing comprehensive change detection across diverse scales. Without bells and whistles, ChangeViT achieves state-of-the-art performance on three popular high-resolution datasets (i.e., LEVIR-CD, WHU-CD, and CLCD) and one low-resolution dataset (i.e., OSCD), which underscores the unleashed potential of plain ViTs for change detection. Furthermore, thorough quantitative and qualitative analyses validate the efficacy of the introduced modules, solidifying the effectiveness of our approach. The source code is available at https://github.com/zhuduowang/ChangeViT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12847v1-abstract-full').style.display = 'none'; document.getElementById('2406.12847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Shao%2C+Z&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>