Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 3,266 results for author: <span class="mathjax">Chen, H</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Chen%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Chen, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chen%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chen, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14305">arXiv:2411.14305</a> <span> [<a href="https://arxiv.org/pdf/2411.14305">pdf</a>, <a href="https://arxiv.org/ps/2411.14305">ps</a>, <a href="https://arxiv.org/format/2411.14305">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Outlier-robust Mean Estimation near the Breakdown Point via Sum-of-Squares </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongjie Chen</a>, <a href="/search/cs?searchtype=author&query=Sridharan%2C+D+N">Deepak Narayanan Sridharan</a>, <a href="/search/cs?searchtype=author&query=Steurer%2C+D">David Steurer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14305v1-abstract-short" style="display: inline;"> We revisit the problem of estimating the mean of a high-dimensional distribution in the presence of an $\varepsilon$-fraction of adversarial outliers. When $\varepsilon$ is at most some sufficiently small constant, previous works can achieve optimal error rate efficiently \cite{diakonikolas2018robustly, kothari2018robust}. As $\varepsilon$ approaches the breakdown point $\frac{1}{2}$, all previo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14305v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14305v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14305v1-abstract-full" style="display: none;"> We revisit the problem of estimating the mean of a high-dimensional distribution in the presence of an $\varepsilon$-fraction of adversarial outliers. When $\varepsilon$ is at most some sufficiently small constant, previous works can achieve optimal error rate efficiently \cite{diakonikolas2018robustly, kothari2018robust}. As $\varepsilon$ approaches the breakdown point $\frac{1}{2}$, all previous algorithms incur either sub-optimal error rates or exponential running time. In this paper we give a new analysis of the canonical sum-of-squares program introduced in \cite{kothari2018robust} and show that this program efficiently achieves optimal error rate for all $\varepsilon \in[0,\frac{1}{2})$. The key ingredient for our results is a new identifiability proof for robust mean estimation that focuses on the overlap between the distributions instead of their statistical distance as in previous works. We capture this proof within the sum-of-squares proof system, thus obtaining efficient algorithms using the sum-of-squares proofs to algorithms paradigm \cite{raghavendra2018high}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14305v1-abstract-full').style.display = 'none'; document.getElementById('2411.14305v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at SODA 2025, 47 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12989">arXiv:2411.12989</a> <span> [<a href="https://arxiv.org/pdf/2411.12989">pdf</a>, <a href="https://arxiv.org/format/2411.12989">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Data Watermarking for Sequential Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sixiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+C">Cheng Long</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wei Yuan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongxu Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongzhi Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12989v1-abstract-short" style="display: inline;"> In the era of large foundation models, data has become a crucial component for building high-performance AI systems. As the demand for high-quality and large-scale data continues to rise, data copyright protection is attracting increasing attention. In this work, we explore the problem of data watermarking for sequential recommender systems, where a watermark is embedded into the target dataset an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12989v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12989v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12989v1-abstract-full" style="display: none;"> In the era of large foundation models, data has become a crucial component for building high-performance AI systems. As the demand for high-quality and large-scale data continues to rise, data copyright protection is attracting increasing attention. In this work, we explore the problem of data watermarking for sequential recommender systems, where a watermark is embedded into the target dataset and can be detected in models trained on that dataset. We address two specific challenges: dataset watermarking, which protects the ownership of the entire dataset, and user watermarking, which safeguards the data of individual users. We systematically define these problems and present a method named DWRS to address them. Our approach involves randomly selecting unpopular items to create a watermark sequence, which is then inserted into normal users' interaction sequences. Extensive experiments on five representative sequential recommendation models and three benchmark datasets demonstrate the effectiveness of DWRS in protecting data copyright while preserving model utility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12989v1-abstract-full').style.display = 'none'; document.getElementById('2411.12989v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12981">arXiv:2411.12981</a> <span> [<a href="https://arxiv.org/pdf/2411.12981">pdf</a>, <a href="https://arxiv.org/format/2411.12981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GazeGaussian: High-Fidelity Gaze Redirection with 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xiaobao Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangyu Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12981v1-abstract-short" style="display: inline;"> Gaze estimation encounters generalization challenges when dealing with out-of-distribution data. To address this problem, recent methods use neural radiance fields (NeRF) to generate augmented data. However, existing methods based on NeRF are computationally expensive and lack facial details. 3D Gaussian Splatting (3DGS) has become the prevailing representation of neural fields. While 3DGS has bee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12981v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12981v1-abstract-full" style="display: none;"> Gaze estimation encounters generalization challenges when dealing with out-of-distribution data. To address this problem, recent methods use neural radiance fields (NeRF) to generate augmented data. However, existing methods based on NeRF are computationally expensive and lack facial details. 3D Gaussian Splatting (3DGS) has become the prevailing representation of neural fields. While 3DGS has been extensively examined in head avatars, it faces challenges with accurate gaze control and generalization across different subjects. In this work, we propose GazeGaussian, a high-fidelity gaze redirection method that uses a two-stream 3DGS model to represent the face and eye regions separately. By leveraging the unstructured nature of 3DGS, we develop a novel eye representation for rigid eye rotation based on the target gaze direction. To enhance synthesis generalization across various subjects, we integrate an expression-conditional module to guide the neural renderer. Comprehensive experiments show that GazeGaussian outperforms existing methods in rendering speed, gaze redirection accuracy, and facial synthesis across multiple datasets. We also demonstrate that existing gaze estimation methods can leverage GazeGaussian to improve their generalization performance. The code will be available at: https://ucwxb.github.io/GazeGaussian/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12981v1-abstract-full').style.display = 'none'; document.getElementById('2411.12981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12824">arXiv:2411.12824</a> <span> [<a href="https://arxiv.org/pdf/2411.12824">pdf</a>, <a href="https://arxiv.org/format/2411.12824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalized Prompt Tuning: Adapting Frozen Univariate Time Series Foundation Models for Multivariate Healthcare Time Series </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingzhu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A+H">Angela H. Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G+H">George H. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12824v1-abstract-short" style="display: inline;"> Time series foundation models are pre-trained on large datasets and are able to achieve state-of-the-art performance in diverse tasks. However, to date, there has been limited work demonstrating how well these models perform in medical applications, where labeled data can be scarce. Further, we observe that currently, the majority of time series foundation models either are univariate in nature, o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12824v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12824v1-abstract-full" style="display: none;"> Time series foundation models are pre-trained on large datasets and are able to achieve state-of-the-art performance in diverse tasks. However, to date, there has been limited work demonstrating how well these models perform in medical applications, where labeled data can be scarce. Further, we observe that currently, the majority of time series foundation models either are univariate in nature, or assume channel independence, meaning that they handle multivariate time series but do not model how the different variables relate. In this paper, we propose a prompt-tuning-inspired fine-tuning technique, Generalized Prompt Tuning (Gen-P-Tuning), that enables us to adapt an existing univariate time series foundation model (treated as frozen) to handle multivariate time series prediction. Our approach provides a way to combine information across channels (variables) of multivariate time series. We demonstrate the effectiveness of our fine-tuning approach against various baselines on two MIMIC classification tasks, and on influenza-like illness forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12824v1-abstract-full').style.display = 'none'; document.getElementById('2411.12824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Machine Learning for Health (ML4H 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12440">arXiv:2411.12440</a> <span> [<a href="https://arxiv.org/pdf/2411.12440">pdf</a>, <a href="https://arxiv.org/format/2411.12440">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear Kernels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haodong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Runnan Chen</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Q">Qiang Qu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoming Chen</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+Y+Y">Yuk Ying Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12440v2-abstract-short" style="display: inline;"> Recent advancements in 3D Gaussian Splatting (3DGS) have substantially improved novel view synthesis, enabling high-quality reconstruction and real-time rendering. However, blurring artifacts, such as floating primitives and over-reconstruction, remain challenging. Current methods address these issues by refining scene structure, enhancing geometric representations, addressing blur in training ima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12440v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12440v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12440v2-abstract-full" style="display: none;"> Recent advancements in 3D Gaussian Splatting (3DGS) have substantially improved novel view synthesis, enabling high-quality reconstruction and real-time rendering. However, blurring artifacts, such as floating primitives and over-reconstruction, remain challenging. Current methods address these issues by refining scene structure, enhancing geometric representations, addressing blur in training images, improving rendering consistency, and optimizing density control, yet the role of kernel design remains underexplored. We identify the soft boundaries of Gaussian ellipsoids as one of the causes of these artifacts, limiting detail capture in high-frequency regions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which replaces Gaussian kernels with linear kernels to achieve sharper and more precise results, particularly in high-frequency regions. Through evaluations on three datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along with a 30% FPS improvement over baseline 3DGS. The implementation will be made publicly available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12440v2-abstract-full').style.display = 'none'; document.getElementById('2411.12440v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12389">arXiv:2411.12389</a> <span> [<a href="https://arxiv.org/pdf/2411.12389">pdf</a>, <a href="https://arxiv.org/format/2411.12389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Combinational Backdoor Attack against Customized Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenbo Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiaming He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongwei Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guowen Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+M">Meng Hao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haomiao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12389v1-abstract-short" style="display: inline;"> Recently, Text-to-Image (T2I) synthesis technology has made tremendous strides. Numerous representative T2I models have emerged and achieved promising application outcomes, such as DALL-E, Stable Diffusion, Imagen, etc. In practice, it has become increasingly popular for model developers to selectively adopt various pre-trained text encoders and conditional diffusion models from third-party platfo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12389v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12389v1-abstract-full" style="display: none;"> Recently, Text-to-Image (T2I) synthesis technology has made tremendous strides. Numerous representative T2I models have emerged and achieved promising application outcomes, such as DALL-E, Stable Diffusion, Imagen, etc. In practice, it has become increasingly popular for model developers to selectively adopt various pre-trained text encoders and conditional diffusion models from third-party platforms, integrating them to build customized (personalized) T2I models. However, such an adoption approach is vulnerable to backdoor attacks. In this work, we propose a Combinational Backdoor Attack against Customized T2I models (CBACT2I) targeting this application scenario. Different from previous backdoor attacks against T2I models, CBACT2I embeds the backdoor into the text encoder and the conditional diffusion model separately. The customized T2I model exhibits backdoor behaviors only when the backdoor text encoder is used in combination with the backdoor conditional diffusion model. These properties make CBACT2I more stealthy and flexible than prior backdoor attacks against T2I models. Extensive experiments demonstrate the effectiveness of CBACT2I with different backdoor triggers and different backdoor targets on the open-sourced Stable Diffusion model. This work reveals the backdoor vulnerabilities of customized T2I models and urges countermeasures to mitigate backdoor threats in this scenario. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12389v1-abstract-full').style.display = 'none'; document.getElementById('2411.12389v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12275">arXiv:2411.12275</a> <span> [<a href="https://arxiv.org/pdf/2411.12275">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Building Trust: Foundations of Security, Safety and Transparency in AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sidhpurwala%2C+H">Huzaifa Sidhpurwala</a>, <a href="/search/cs?searchtype=author&query=Mollett%2C+G">Garth Mollett</a>, <a href="/search/cs?searchtype=author&query=Fox%2C+E">Emily Fox</a>, <a href="/search/cs?searchtype=author&query=Bestavros%2C+M">Mark Bestavros</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huamin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12275v1-abstract-short" style="display: inline;"> This paper explores the rapidly evolving ecosystem of publicly available AI models, and their potential implications on the security and safety landscape. As AI models become increasingly prevalent, understanding their potential risks and vulnerabilities is crucial. We review the current security and safety scenarios while highlighting challenges such as tracking issues, remediation, and the appar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12275v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12275v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12275v1-abstract-full" style="display: none;"> This paper explores the rapidly evolving ecosystem of publicly available AI models, and their potential implications on the security and safety landscape. As AI models become increasingly prevalent, understanding their potential risks and vulnerabilities is crucial. We review the current security and safety scenarios while highlighting challenges such as tracking issues, remediation, and the apparent absence of AI model lifecycle and ownership processes. Comprehensive strategies to enhance security and safety for both model developers and end-users are proposed. This paper aims to provide some of the foundational pieces for more standardized security, safety, and transparency in the development and operation of AI models and the larger open ecosystems and communities forming around them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12275v1-abstract-full').style.display = 'none'; document.getElementById('2411.12275v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12036">arXiv:2411.12036</a> <span> [<a href="https://arxiv.org/pdf/2411.12036">pdf</a>, <a href="https://arxiv.org/format/2411.12036">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Econometrics">econ.EM</span> </div> </div> <p class="title is-5 mathjax"> Prediction-Guided Active Experiments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ao%2C+R">Ruicheng Ao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Simchi-Levi%2C+D">David Simchi-Levi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12036v2-abstract-short" style="display: inline;"> In this work, we introduce a new framework for active experimentation, the Prediction-Guided Active Experiment (PGAE), which leverages predictions from an existing machine learning model to guide sampling and experimentation. Specifically, at each time step, an experimental unit is sampled according to a designated sampling distribution, and the actual outcome is observed based on an experimental… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12036v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12036v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12036v2-abstract-full" style="display: none;"> In this work, we introduce a new framework for active experimentation, the Prediction-Guided Active Experiment (PGAE), which leverages predictions from an existing machine learning model to guide sampling and experimentation. Specifically, at each time step, an experimental unit is sampled according to a designated sampling distribution, and the actual outcome is observed based on an experimental probability. Otherwise, only a prediction for the outcome is available. We begin by analyzing the non-adaptive case, where full information on the joint distribution of the predictor and the actual outcome is assumed. For this scenario, we derive an optimal experimentation strategy by minimizing the semi-parametric efficiency bound for the class of regular estimators. We then introduce an estimator that meets this efficiency bound, achieving asymptotic optimality. Next, we move to the adaptive case, where the predictor is continuously updated with newly sampled data. We show that the adaptive version of the estimator remains efficient and attains the same semi-parametric bound under certain regularity assumptions. Finally, we validate PGAE's performance through simulations and a semi-synthetic experiment using data from the US Census Bureau. The results underscore the PGAE framework's effectiveness and superiority compared to other existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12036v2-abstract-full').style.display = 'none'; document.getElementById('2411.12036v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11717">arXiv:2411.11717</a> <span> [<a href="https://arxiv.org/pdf/2411.11717">pdf</a>, <a href="https://arxiv.org/format/2411.11717">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RAWMamba: Unified sRGB-to-RAW De-rendering With State Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongjun Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Wencheng Han</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Huan Zheng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jianbing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11717v1-abstract-short" style="display: inline;"> Recent advancements in sRGB-to-RAW de-rendering have increasingly emphasized metadata-driven approaches to reconstruct RAW data from sRGB images, supplemented by partial RAW information. In image-based de-rendering, metadata is commonly obtained through sampling, whereas in video tasks, it is typically derived from the initial frame. The distinct metadata requirements necessitate specialized netwo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11717v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11717v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11717v1-abstract-full" style="display: none;"> Recent advancements in sRGB-to-RAW de-rendering have increasingly emphasized metadata-driven approaches to reconstruct RAW data from sRGB images, supplemented by partial RAW information. In image-based de-rendering, metadata is commonly obtained through sampling, whereas in video tasks, it is typically derived from the initial frame. The distinct metadata requirements necessitate specialized network architectures, leading to architectural incompatibilities that increase deployment complexity. In this paper, we propose RAWMamba, a Mamba-based unified framework developed for sRGB-to-RAW de-rendering across both image and video domains. The core of RAWMamba is the Unified Metadata Embedding (UME) module, which harmonizes diverse metadata types into a unified representation. In detail, a multi-perspective affinity modeling method is proposed to promote the extraction of reference information. In addition, we introduce the Local Tone-Aware Mamba (LTA-Mamba) module, which captures long-range dependencies to enable effective global propagation of metadata. Experimental results demonstrate that the proposed RAWMamba achieves state-of-the-art performance, yielding high-quality RAW data reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11717v1-abstract-full').style.display = 'none'; document.getElementById('2411.11717v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11659">arXiv:2411.11659</a> <span> [<a href="https://arxiv.org/pdf/2411.11659">pdf</a>, <a href="https://arxiv.org/ps/2411.11659">ps</a>, <a href="https://arxiv.org/format/2411.11659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Improving Data Curation of Software Vulnerability Patches through Uncertainty Quantification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunhua Zhao</a>, <a href="/search/cs?searchtype=author&query=Damevski%2C+K">Kostadin Damevski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11659v1-abstract-short" style="display: inline;"> The changesets (or patches) that fix open source software vulnerabilities form critical datasets for various machine learning security-enhancing applications, such as automated vulnerability patching and silent fix detection. These patch datasets are derived from extensive collections of historical vulnerability fixes, maintained in databases like the Common Vulnerabilities and Exposures list and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11659v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11659v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11659v1-abstract-full" style="display: none;"> The changesets (or patches) that fix open source software vulnerabilities form critical datasets for various machine learning security-enhancing applications, such as automated vulnerability patching and silent fix detection. These patch datasets are derived from extensive collections of historical vulnerability fixes, maintained in databases like the Common Vulnerabilities and Exposures list and the National Vulnerability Database. However, since these databases focus on rapid notification to the security community, they contain significant inaccuracies and omissions that have a negative impact on downstream software security quality assurance tasks. In this paper, we propose an approach employing Uncertainty Quantification (UQ) to curate datasets of publicly-available software vulnerability patches. Our methodology leverages machine learning models that incorporate UQ to differentiate between patches based on their potential utility. We begin by evaluating a number of popular UQ techniques, including Vanilla, Monte Carlo Dropout, and Model Ensemble, as well as homoscedastic and heteroscedastic models of noise. Our findings indicate that Model Ensemble and heteroscedastic models are the best choices for vulnerability patch datasets. Based on these UQ modeling choices, we propose a heuristic that uses UQ to filter out lower quality instances and select instances with high utility value from the vulnerability dataset. Using our approach, we observe an improvement in predictive performance and significant reduction of model training time (i.e., energy consumption) for a state-of-the-art vulnerability prediction model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11659v1-abstract-full').style.display = 'none'; document.getElementById('2411.11659v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11641">arXiv:2411.11641</a> <span> [<a href="https://arxiv.org/pdf/2411.11641">pdf</a>, <a href="https://arxiv.org/format/2411.11641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TSINR: Capturing Temporal Continuity via Implicit Neural Representations for Time Series Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mengxuan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Ke Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyang Chen</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiajun Bu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haishuai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11641v2-abstract-short" style="display: inline;"> Time series anomaly detection aims to identify unusual patterns in data or deviations from systems' expected behavior. The reconstruction-based methods are the mainstream in this task, which learn point-wise representation via unsupervised learning. However, the unlabeled anomaly points in training data may cause these reconstruction-based methods to learn and reconstruct anomalous data, resulting… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11641v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11641v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11641v2-abstract-full" style="display: none;"> Time series anomaly detection aims to identify unusual patterns in data or deviations from systems' expected behavior. The reconstruction-based methods are the mainstream in this task, which learn point-wise representation via unsupervised learning. However, the unlabeled anomaly points in training data may cause these reconstruction-based methods to learn and reconstruct anomalous data, resulting in the challenge of capturing normal patterns. In this paper, we propose a time series anomaly detection method based on implicit neural representation (INR) reconstruction, named TSINR, to address this challenge. Due to the property of spectral bias, TSINR enables prioritizing low-frequency signals and exhibiting poorer performance on high-frequency abnormal data. Specifically, we adopt INR to parameterize time series data as a continuous function and employ a transformer-based architecture to predict the INR of given data. As a result, the proposed TSINR method achieves the advantage of capturing the temporal continuity and thus is more sensitive to discontinuous anomaly data. In addition, we further design a novel form of INR continuous function to learn inter- and intra-channel information, and leverage a pre-trained large language model to amplify the intense fluctuations in anomalies. Extensive experiments demonstrate that TSINR achieves superior overall performance on both univariate and multivariate time series anomaly detection benchmarks compared to other state-of-the-art reconstruction-based methods. Our codes are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11641v2-abstract-full').style.display = 'none'; document.getElementById('2411.11641v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGKDD 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11487">arXiv:2411.11487</a> <span> [<a href="https://arxiv.org/pdf/2411.11487">pdf</a>, <a href="https://arxiv.org/format/2411.11487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Look a Group at Once: Multi-Slide Modeling for Survival Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yi Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianfei Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haixian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11487v1-abstract-short" style="display: inline;"> Survival prediction is a critical task in pathology. In clinical practice, pathologists often examine multiple cases, leveraging a broader spectrum of cancer phenotypes to enhance pathological assessment. Despite significant advancements in deep learning, current solutions typically model each slide as a sample, struggling to effectively capture comparable and slide-agnostic pathological features.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11487v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11487v1-abstract-full" style="display: none;"> Survival prediction is a critical task in pathology. In clinical practice, pathologists often examine multiple cases, leveraging a broader spectrum of cancer phenotypes to enhance pathological assessment. Despite significant advancements in deep learning, current solutions typically model each slide as a sample, struggling to effectively capture comparable and slide-agnostic pathological features. In this paper, we introduce GroupMIL, a novel framework inspired by the clinical practice of collective analysis, which models multiple slides as a single sample and organizes groups of patches and slides sequentially to capture cross-slide prognostic features. We also present GPAMamba, a model designed to facilitate intra- and inter-slide feature interactions, effectively capturing local micro-environmental characteristics within slide-level graphs while uncovering essential prognostic patterns across an extended patch sequence within the group framework. Furthermore, we develop a dual-head predictor that delivers comprehensive survival risk and probability assessments for each patient. Extensive empirical evaluations demonstrate that our model significantly outperforms state-of-the-art approaches across five datasets from The Cancer Genome Atlas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11487v1-abstract-full').style.display = 'none'; document.getElementById('2411.11487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11435">arXiv:2411.11435</a> <span> [<a href="https://arxiv.org/pdf/2411.11435">pdf</a>, <a href="https://arxiv.org/format/2411.11435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GLDesigner: Leveraging Multi-Modal LLMs as Designer for Enhanced Aesthetic Text Glyph Layouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Junwen He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jun-Yan He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jin-Peng Lan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yifeng Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11435v1-abstract-short" style="display: inline;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11435v1-abstract-full" style="display: none;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper, we propose a VLM-based framework that generates content-aware text logo layouts by integrating multi-modal inputs with user constraints, supporting a more flexible and stable layout design in real-world applications. We introduce two model techniques to reduce the computation for processing multiple glyph images simultaneously, while does not face performance degradation. To support instruction-tuning of out model, we construct two extensive text logo datasets, which are 5x more larger than the existing public dataset. Except for the geometric annotations (e.g. text masks and character recognition), we also compliment with comprehensive layout descriptions in natural language format, for more effective training to have reasoning ability when dealing with complex layouts and custom user constraints. Experimental studies demonstrate the effectiveness of our proposed model and datasets, when comparing with previous methods in various benchmarks to evaluate geometric aesthetics and human preferences. The code and datasets will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'none'; document.getElementById('2411.11435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11305">arXiv:2411.11305</a> <span> [<a href="https://arxiv.org/pdf/2411.11305">pdf</a>, <a href="https://arxiv.org/ps/2411.11305">ps</a>, <a href="https://arxiv.org/format/2411.11305">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ranmin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+L">Limin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongkun Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Boyan Xu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Ruichu Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11305v2-abstract-short" style="display: inline;"> The advancement of medical image segmentation techniques has been propelled by the adoption of deep learning techniques, particularly UNet-based approaches, which exploit semantic information to improve the accuracy of segmentations. However, the order of organs in scanned images has been disregarded by current medical image segmentation approaches based on UNet. Furthermore, the inherent network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11305v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11305v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11305v2-abstract-full" style="display: none;"> The advancement of medical image segmentation techniques has been propelled by the adoption of deep learning techniques, particularly UNet-based approaches, which exploit semantic information to improve the accuracy of segmentations. However, the order of organs in scanned images has been disregarded by current medical image segmentation approaches based on UNet. Furthermore, the inherent network structure of UNet does not provide direct capabilities for integrating temporal information. To efficiently integrate temporal information, we propose TP-UNet that utilizes temporal prompts, encompassing organ-construction relationships, to guide the segmentation UNet model. Specifically, our framework is featured with cross-attention and semantic alignment based on unsupervised contrastive learning to combine temporal prompts and image features effectively. Extensive evaluations on two medical image segmentation datasets demonstrate the state-of-the-art performance of TP-UNet. Our implementation will be open-sourced after acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11305v2-abstract-full').style.display = 'none'; document.getElementById('2411.11305v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11223">arXiv:2411.11223</a> <span> [<a href="https://arxiv.org/pdf/2411.11223">pdf</a>, <a href="https://arxiv.org/format/2411.11223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Transfer Learning for Video-language Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoxing Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zizheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yan Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanshuo Wang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Z">Zhongcai Lyu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhuoer Xu</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jun Lan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Z">Zhangxuan Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11223v1-abstract-short" style="display: inline;"> Pre-trained vision-language models provide a robust foundation for efficient transfer learning across various downstream tasks. In the field of video action recognition, mainstream approaches often introduce additional parameter modules to capture temporal information. While the increased model capacity brought by these additional parameters helps better fit the video-specific inductive biases, ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11223v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11223v1-abstract-full" style="display: none;"> Pre-trained vision-language models provide a robust foundation for efficient transfer learning across various downstream tasks. In the field of video action recognition, mainstream approaches often introduce additional parameter modules to capture temporal information. While the increased model capacity brought by these additional parameters helps better fit the video-specific inductive biases, existing methods require learning a large number of parameters and are prone to catastrophic forgetting of the original generalizable knowledge. In this paper, we propose a simple yet effective Multi-modal Spatio-Temporal Adapter (MSTA) to improve the alignment between representations in the text and vision branches, achieving a balance between general knowledge and task-specific knowledge. Furthermore, to mitigate over-fitting and enhance generalizability, we introduce a spatio-temporal description-guided consistency constraint. This constraint involves feeding template inputs (i.e., ``a video of $\{\textbf{cls}\}$'') into the trainable language branch, while LLM-generated spatio-temporal descriptions are input into the pre-trained language branch, enforcing consistency between the outputs of the two branches. This mechanism prevents over-fitting to downstream tasks and improves the distinguishability of the trainable branch within the spatio-temporal semantic space. We evaluate the effectiveness of our approach across four tasks: zero-shot transfer, few-shot learning, base-to-novel generalization, and fully-supervised learning. Compared to many state-of-the-art methods, our MSTA achieves outstanding performance across all evaluations, while using only 2-7\% of the trainable parameters in the original model. Code will be avaliable at https://github.com/chenhaoxing/ETL4Video. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11223v1-abstract-full').style.display = 'none'; document.getElementById('2411.11223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10681">arXiv:2411.10681</a> <span> [<a href="https://arxiv.org/pdf/2411.10681">pdf</a>, <a href="https://arxiv.org/format/2411.10681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Structured Dialogue System for Mental Health: An LLM Chatbot Leveraging the PM+ Guidelines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yixiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinran Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xurong Xie</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+N">Nan Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10681v1-abstract-short" style="display: inline;"> The Structured Dialogue System, referred to as SuDoSys, is an innovative Large Language Model (LLM)-based chatbot designed to provide psychological counseling. SuDoSys leverages the World Health Organization (WHO)'s Problem Management Plus (PM+) guidelines to deliver stage-aware multi-turn dialogues. Existing methods for employing an LLM in multi-turn psychological counseling typically involve dir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10681v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10681v1-abstract-full" style="display: none;"> The Structured Dialogue System, referred to as SuDoSys, is an innovative Large Language Model (LLM)-based chatbot designed to provide psychological counseling. SuDoSys leverages the World Health Organization (WHO)'s Problem Management Plus (PM+) guidelines to deliver stage-aware multi-turn dialogues. Existing methods for employing an LLM in multi-turn psychological counseling typically involve direct fine-tuning using generated dialogues, often neglecting the dynamic stage shifts of counseling sessions. Unlike previous approaches, SuDoSys considers the different stages of counseling and stores essential information throughout the counseling process, ensuring coherent and directed conversations. The system employs an LLM, a stage-aware instruction generator, a response unpacker, a topic database, and a stage controller to maintain dialogue flow. In addition, we propose a novel technique that simulates counseling clients to interact with the evaluated system and evaluate its performance automatically. When assessed using both objective and subjective evaluations, SuDoSys demonstrates its effectiveness in generating logically coherent responses. The system's code and program scripts for evaluation are open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10681v1-abstract-full').style.display = 'none'; document.getElementById('2411.10681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 16th International Conference on Social Robotic (ICSR 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10666">arXiv:2411.10666</a> <span> [<a href="https://arxiv.org/pdf/2411.10666">pdf</a>, <a href="https://arxiv.org/format/2411.10666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SAM Decoding: Speculative Decoding via Suffix Automaton </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuxuan Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cuiping Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10666v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have revolutionized natural language processing by unifying tasks into text generation, yet their large parameter sizes and autoregressive nature limit inference speed. SAM-Decoding addresses this by introducing a novel retrieval-based speculative decoding method that uses a suffix automaton for efficient and accurate draft generation. Unlike n-gram matching used by th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10666v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10666v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have revolutionized natural language processing by unifying tasks into text generation, yet their large parameter sizes and autoregressive nature limit inference speed. SAM-Decoding addresses this by introducing a novel retrieval-based speculative decoding method that uses a suffix automaton for efficient and accurate draft generation. Unlike n-gram matching used by the existing method, SAM-Decoding finds the longest suffix match in generating text and text corpuss, achieving an average time complexity of $O(1)$ per generation step. SAM-Decoding constructs static and dynamic suffix automatons for the text corpus and input prompts, respectively, enabling fast and precise draft generation. Meanwhile, it is designed as an approach that can be combined with existing methods, allowing SAM-Decoding to adaptively select a draft generation strategy based on the matching length, thus increasing the inference speed of the LLM. When combined with Token Recycling, evaluations show SAM-Decoding outperforms existing model-free methods, achieving a speedup of $2.27\times$ over autoregressive decoding on Spec-Bench. When combined with EAGLE2, it reaches a speedup of $2.49\times$, surpassing all current approaches. Our code is available at https://github.com/hyx1999/SAM-Decoding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10666v1-abstract-full').style.display = 'none'; document.getElementById('2411.10666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 3 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10495">arXiv:2411.10495</a> <span> [<a href="https://arxiv.org/pdf/2411.10495">pdf</a>, <a href="https://arxiv.org/format/2411.10495">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Boundary Attention Constrained Zero-Shot Layout-To-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huancheng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingtao Li</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+W">Weiming Zhuang</a>, <a href="/search/cs?searchtype=author&query=Vikalo%2C+H">Haris Vikalo</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+L">Lingjuan Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10495v1-abstract-short" style="display: inline;"> Recent text-to-image diffusion models excel at generating high-resolution images from text but struggle with precise control over spatial composition and object counting. To address these challenges, several studies developed layout-to-image (L2I) approaches that incorporate layout instructions into text-to-image models. However, existing L2I methods typically require either fine-tuning pretrained… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10495v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10495v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10495v1-abstract-full" style="display: none;"> Recent text-to-image diffusion models excel at generating high-resolution images from text but struggle with precise control over spatial composition and object counting. To address these challenges, several studies developed layout-to-image (L2I) approaches that incorporate layout instructions into text-to-image models. However, existing L2I methods typically require either fine-tuning pretrained parameters or training additional control modules for the diffusion models. In this work, we propose a novel zero-shot L2I approach, BACON (Boundary Attention Constrained generation), which eliminates the need for additional modules or fine-tuning. Specifically, we use text-visual cross-attention feature maps to quantify inconsistencies between the layout of the generated images and the provided instructions, and then compute loss functions to optimize latent features during the diffusion reverse process. To enhance spatial controllability and mitigate semantic failures in complex layout instructions, we leverage pixel-to-pixel correlations in the self-attention feature maps to align cross-attention maps and combine three loss functions constrained by boundary attention to update latent features. Comprehensive experimental results on both L2I and non-L2I pretrained diffusion models demonstrate that our method outperforms existing zero-shot L2I techniuqes both quantitatively and qualitatively in terms of image composition on the DrawBench and HRS benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10495v1-abstract-full').style.display = 'none'; document.getElementById('2411.10495v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10364">arXiv:2411.10364</a> <span> [<a href="https://arxiv.org/pdf/2411.10364">pdf</a>, <a href="https://arxiv.org/format/2411.10364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Forming Auxiliary High-confident Instance-level Loss to Promote Learning from Label Proportions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tianhao Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Han Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Juncheng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yungang Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Ximing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10364v1-abstract-short" style="display: inline;"> Learning from label proportions (LLP), i.e., a challenging weakly-supervised learning task, aims to train a classifier by using bags of instances and the proportions of classes within bags, rather than annotated labels for each instance. Beyond the traditional bag-level loss, the mainstream methodology of LLP is to incorporate an auxiliary instance-level loss with pseudo-labels formed by predictio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10364v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10364v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10364v1-abstract-full" style="display: none;"> Learning from label proportions (LLP), i.e., a challenging weakly-supervised learning task, aims to train a classifier by using bags of instances and the proportions of classes within bags, rather than annotated labels for each instance. Beyond the traditional bag-level loss, the mainstream methodology of LLP is to incorporate an auxiliary instance-level loss with pseudo-labels formed by predictions. Unfortunately, we empirically observed that the pseudo-labels are are often inaccurate due to over-smoothing, especially for the scenarios with large bag sizes, hurting the classifier induction. To alleviate this problem, we suggest a novel LLP method, namely Learning from Label Proportions with Auxiliary High-confident Instance-level Loss (L^2P-AHIL). Specifically, we propose a dual entropy-based weight (DEW) method to adaptively measure the confidences of pseudo-labels. It simultaneously emphasizes accurate predictions at the bag level and avoids overly smoothed predictions. We then form high-confident instance-level loss with DEW, and jointly optimize it with the bag-level loss in a self-training manner. The experimental results on benchmark datasets show that L^2P-AHIL can surpass the existing baseline methods, and the performance gain can be more significant as the bag size increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10364v1-abstract-full').style.display = 'none'; document.getElementById('2411.10364v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10272">arXiv:2411.10272</a> <span> [<a href="https://arxiv.org/pdf/2411.10272">pdf</a>, <a href="https://arxiv.org/format/2411.10272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling Law for Post-training after Model Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaodong Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuxuan Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cuiping Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10272v1-abstract-short" style="display: inline;"> Large language models (LLMs) based on the Transformer architecture are widely employed across various domains and tasks. However, their increasing size imposes significant hardware demands, limiting practical deployment. To mitigate this, model pruning techniques have been developed to create more efficient models while maintaining high performance. Despite this, post-training after pruning is cru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10272v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10272v1-abstract-full" style="display: none;"> Large language models (LLMs) based on the Transformer architecture are widely employed across various domains and tasks. However, their increasing size imposes significant hardware demands, limiting practical deployment. To mitigate this, model pruning techniques have been developed to create more efficient models while maintaining high performance. Despite this, post-training after pruning is crucial for performance recovery and can be resource-intensive. This paper investigates the post-training requirements of pruned LLMs and introduces a scaling law to determine the optimal amount of post-training data. Post-training experiments with the Llama-3 and Qwen-2.5 series models, pruned using depth pruning, width pruning, and 2:4 semi-structured pruning, show that higher pruning ratios necessitate more post-training data for performance recovery, whereas larger LLMs require less. The proposed scaling law predicts a model's loss based on its parameter counts before and after pruning, as well as the post-training token counts. Furthermore, we find that the scaling law established from smaller LLMs can be reliably extrapolated to larger LLMs. This work provides valuable insights into the post-training of pruned LLMs and offers a practical scaling law for optimizing post-training data usage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10272v1-abstract-full').style.display = 'none'; document.getElementById('2411.10272v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10229">arXiv:2411.10229</a> <span> [<a href="https://arxiv.org/pdf/2411.10229">pdf</a>, <a href="https://arxiv.org/format/2411.10229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Optimally Rewriting Formulas and Database Queries: A Confluence of Term Rewriting, Structural Decomposition, and Complexity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hubie Chen</a>, <a href="/search/cs?searchtype=author&query=Mengel%2C+S">Stefan Mengel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10229v1-abstract-short" style="display: inline;"> A central computational task in database theory, finite model theory, and computer science at large is the evaluation of a first-order sentence on a finite structure. In the context of this task, the \emph{width} of a sentence, defined as the maximum number of free variables over all subformulas, has been established as a crucial measure, where minimizing width of a sentence (while retaining logic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10229v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10229v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10229v1-abstract-full" style="display: none;"> A central computational task in database theory, finite model theory, and computer science at large is the evaluation of a first-order sentence on a finite structure. In the context of this task, the \emph{width} of a sentence, defined as the maximum number of free variables over all subformulas, has been established as a crucial measure, where minimizing width of a sentence (while retaining logical equivalence) is considered highly desirable. An undecidability result rules out the possibility of an algorithm that, given a first-order sentence, returns a logically equivalent sentence of minimum width; this result motivates the study of width minimization via syntactic rewriting rules, which is this article's focus. For a number of common rewriting rules (which are known to preserve logical equivalence), including rules that allow for the movement of quantifiers, we present an algorithm that, given a positive first-order sentence $蠁$, outputs the minimum-width sentence obtainable from $蠁$ via application of these rules. We thus obtain a complete algorithmic understanding of width minimization up to the studied rules; this result is the first one -- of which we are aware -- that establishes this type of understanding in such a general setting. Our result builds on the theory of term rewriting and establishes an interface among this theory, query evaluation, and structural decomposition theory. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10229v1-abstract-full').style.display = 'none'; document.getElementById('2411.10229v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09344">arXiv:2411.09344</a> <span> [<a href="https://arxiv.org/pdf/2411.09344">pdf</a>, <a href="https://arxiv.org/format/2411.09344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adaptively Augmented Consistency Learning: A Semi-supervised Segmentation Framework for Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hui Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haodong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoming Chen</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+V">Vera Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09344v1-abstract-short" style="display: inline;"> Remote sensing (RS) involves the acquisition of data about objects or areas from a distance, primarily to monitor environmental changes, manage resources, and support planning and disaster response. A significant challenge in RS segmentation is the scarcity of high-quality labeled images due to the diversity and complexity of RS image, which makes pixel-level annotation difficult and hinders the d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09344v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09344v1-abstract-full" style="display: none;"> Remote sensing (RS) involves the acquisition of data about objects or areas from a distance, primarily to monitor environmental changes, manage resources, and support planning and disaster response. A significant challenge in RS segmentation is the scarcity of high-quality labeled images due to the diversity and complexity of RS image, which makes pixel-level annotation difficult and hinders the development of effective supervised segmentation algorithms. To solve this problem, we propose Adaptively Augmented Consistency Learning (AACL), a semi-supervised segmentation framework designed to enhances RS segmentation accuracy under condictions of limited labeled data. AACL extracts additional information embedded in unlabeled images through the use of Uniform Strength Augmentation (USAug) and Adaptive Cut-Mix (AdaCM). Evaluations across various RS datasets demonstrate that AACL achieves competitive performance in semi-supervised segmentation, showing up to a 20% improvement in specific categories and 2% increase in overall performance compared to state-of-the-art frameworks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09344v1-abstract-full').style.display = 'none'; document.getElementById('2411.09344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Conference on Neural Information Processing 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08726">arXiv:2411.08726</a> <span> [<a href="https://arxiv.org/pdf/2411.08726">pdf</a>, <a href="https://arxiv.org/format/2411.08726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> </div> </div> <p class="title is-5 mathjax"> Analyst Reports and Stock Performance: Evidence from the Chinese Market </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Rui Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiayou Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haolong Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yujia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08726v1-abstract-short" style="display: inline;"> This article applies natural language processing (NLP) to extract and quantify textual information to predict stock performance. Using an extensive dataset of Chinese analyst reports and employing a customized BERT deep learning model for Chinese text, this study categorizes the sentiment of the reports as positive, neutral, or negative. The findings underscore the predictive capacity of this sent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08726v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08726v1-abstract-full" style="display: none;"> This article applies natural language processing (NLP) to extract and quantify textual information to predict stock performance. Using an extensive dataset of Chinese analyst reports and employing a customized BERT deep learning model for Chinese text, this study categorizes the sentiment of the reports as positive, neutral, or negative. The findings underscore the predictive capacity of this sentiment indicator for stock volatility, excess returns, and trading volume. Specifically, analyst reports with strong positive sentiment will increase excess return and intraday volatility, and vice versa, reports with strong negative sentiment also increase volatility and trading volume, but decrease future excess return. The magnitude of this effect is greater for positive sentiment reports than for negative sentiment reports. This article contributes to the empirical literature on sentiment analysis and the response of the stock market to news in the Chinese stock market. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08726v1-abstract-full').style.display = 'none'; document.getElementById('2411.08726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08347">arXiv:2411.08347</a> <span> [<a href="https://arxiv.org/pdf/2411.08347">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> A Chinese Multi-label Affective Computing Dataset Based on Social Media Network Users </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Senlin Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haofan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08347v1-abstract-short" style="display: inline;"> Emotion and personality are central elements in understanding human psychological states. Emotions reflect an individual subjective experiences, while personality reveals relatively stable behavioral and cognitive patterns. Existing affective computing datasets often annotate emotion and personality traits separately, lacking fine-grained labeling of micro-emotions and emotion intensity in both si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08347v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08347v1-abstract-full" style="display: none;"> Emotion and personality are central elements in understanding human psychological states. Emotions reflect an individual subjective experiences, while personality reveals relatively stable behavioral and cognitive patterns. Existing affective computing datasets often annotate emotion and personality traits separately, lacking fine-grained labeling of micro-emotions and emotion intensity in both single-label and multi-label classifications. Chinese emotion datasets are extremely scarce, and datasets capturing Chinese user personality traits are even more limited. To address these gaps, this study collected data from the major social media platform Weibo, screening 11,338 valid users from over 50,000 individuals with diverse MBTI personality labels and acquiring 566,900 posts along with the user MBTI personality tags. Using the EQN method, we compiled a multi-label Chinese affective computing dataset that integrates the same user's personality traits with six emotions and micro-emotions, each annotated with intensity levels. Validation results across multiple NLP classification models demonstrate the dataset strong utility. This dataset is designed to advance machine recognition of complex human emotions and provide data support for research in psychology, education, marketing, finance, and politics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08347v1-abstract-full').style.display = 'none'; document.getElementById('2411.08347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07660">arXiv:2411.07660</a> <span> [<a href="https://arxiv.org/pdf/2411.07660">pdf</a>, <a href="https://arxiv.org/format/2411.07660">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HMIL: Hierarchical Multi-Instance Learning for Fine-Grained Whole Slide Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07660v1-abstract-short" style="display: inline;"> Fine-grained classification of whole slide images (WSIs) is essential in precision oncology, enabling precise cancer diagnosis and personalized treatment strategies. The core of this task involves distinguishing subtle morphological variations within the same broad category of gigapixel-resolution images, which presents a significant challenge. While the multi-instance learning (MIL) paradigm alle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07660v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07660v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07660v1-abstract-full" style="display: none;"> Fine-grained classification of whole slide images (WSIs) is essential in precision oncology, enabling precise cancer diagnosis and personalized treatment strategies. The core of this task involves distinguishing subtle morphological variations within the same broad category of gigapixel-resolution images, which presents a significant challenge. While the multi-instance learning (MIL) paradigm alleviates the computational burden of WSIs, existing MIL methods often overlook hierarchical label correlations, treating fine-grained classification as a flat multi-class classification task. To overcome these limitations, we introduce a novel hierarchical multi-instance learning (HMIL) framework. By facilitating on the hierarchical alignment of inherent relationships between different hierarchy of labels at instance and bag level, our approach provides a more structured and informative learning process. Specifically, HMIL incorporates a class-wise attention mechanism that aligns hierarchical information at both the instance and bag levels. Furthermore, we introduce supervised contrastive learning to enhance the discriminative capability for fine-grained classification and a curriculum-based dynamic weighting module to adaptively balance the hierarchical feature during training. Extensive experiments on our large-scale cytology cervical cancer (CCC) dataset and two public histology datasets, BRACS and PANDA, demonstrate the state-of-the-art class-wise and overall performance of our HMIL framework. Our source code is available at https://github.com/ChengJin-git/HMIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07660v1-abstract-full').style.display = 'none'; document.getElementById('2411.07660v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07175">arXiv:2411.07175</a> <span> [<a href="https://arxiv.org/pdf/2411.07175">pdf</a>, <a href="https://arxiv.org/format/2411.07175">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Continual Memorization of Factoids in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Howard Chen</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+J">Jiayi Geng</a>, <a href="/search/cs?searchtype=author&query=Bhaskar%2C+A">Adithya Bhaskar</a>, <a href="/search/cs?searchtype=author&query=Friedman%2C+D">Dan Friedman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danqi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07175v1-abstract-short" style="display: inline;"> Large language models can absorb a massive amount of knowledge through pretraining, but pretraining is inefficient for acquiring long-tailed or specialized facts. Therefore, fine-tuning on specialized or new knowledge that reflects changes in the world has become popular, though it risks disrupting the model's original capabilities. We study this fragility in the context of continual memorization,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07175v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07175v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07175v1-abstract-full" style="display: none;"> Large language models can absorb a massive amount of knowledge through pretraining, but pretraining is inefficient for acquiring long-tailed or specialized facts. Therefore, fine-tuning on specialized or new knowledge that reflects changes in the world has become popular, though it risks disrupting the model's original capabilities. We study this fragility in the context of continual memorization, where the model is trained on a small set of long-tail factoids (factual associations) and must retain these factoids after multiple stages of subsequent training on other datasets. Through extensive experiments, we show that LLMs suffer from forgetting across a wide range of subsequent tasks, and simple replay techniques do not fully prevent forgetting, especially when the factoid datasets are trained in the later stages. We posit that there are two ways to alleviate forgetting: 1) protect the memorization process as the model learns the factoids, or 2) reduce interference from training in later stages. With this insight, we develop an effective mitigation strategy: REMIX (Random and Generic Data Mixing). REMIX prevents forgetting by mixing generic data sampled from pretraining corpora or even randomly generated word sequences during each stage, despite being unrelated to the memorized factoids in the first stage. REMIX can recover performance from severe forgetting, often outperforming replay-based methods that have access to the factoids from the first stage. We then analyze how REMIX alters the learning process and find that successful forgetting prevention is associated with a pattern: the model stores factoids in earlier layers than usual and diversifies the set of layers that store these factoids. The efficacy of REMIX invites further investigation into the underlying dynamics of memorization and forgetting, opening exciting possibilities for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07175v1-abstract-full').style.display = 'none'; document.getElementById('2411.07175v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07019">arXiv:2411.07019</a> <span> [<a href="https://arxiv.org/pdf/2411.07019">pdf</a>, <a href="https://arxiv.org/format/2411.07019">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UniHR: Hierarchical Representation Learning for Unified Knowledge Graph Link Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingyang Chen</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+Y">Yin Hua</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqi Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Lei Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huajun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07019v1-abstract-short" style="display: inline;"> Beyond-triple fact representations including hyper-relational facts with auxiliary key-value pairs, temporal facts with additional timestamps, and nested facts implying relationships between facts, are gaining significant attention. However, existing link prediction models are usually designed for one specific type of facts, making it difficult to generalize to other fact representations. To overc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07019v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07019v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07019v1-abstract-full" style="display: none;"> Beyond-triple fact representations including hyper-relational facts with auxiliary key-value pairs, temporal facts with additional timestamps, and nested facts implying relationships between facts, are gaining significant attention. However, existing link prediction models are usually designed for one specific type of facts, making it difficult to generalize to other fact representations. To overcome this limitation, we propose a Unified Hierarchical Representation learning framework (UniHR) for unified knowledge graph link prediction. It consists of a unified Hierarchical Data Representation (HiDR) module and a unified Hierarchical Structure Learning (HiSL) module as graph encoder. The HiDR module unifies hyper-relational KGs, temporal KGs, and nested factual KGs into triple-based representations. Then HiSL incorporates intra-fact and inter-fact message passing, focusing on enhancing the semantic information within individual facts and enriching the structural information between facts. Experimental results across 7 datasets from 3 types of KGs demonstrate that our UniHR outperforms baselines designed for one specific kind of KG, indicating strong generalization capability of HiDR form and the effectiveness of HiSL module. Code and data are available at https://github.com/Lza12a/UniHR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07019v1-abstract-full').style.display = 'none'; document.getElementById('2411.07019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06714">arXiv:2411.06714</a> <span> [<a href="https://arxiv.org/pdf/2411.06714">pdf</a>, <a href="https://arxiv.org/format/2411.06714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiffSR: Learning Radar Reflectivity Synthesis via Diffusion Model from Satellite Observations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xuming He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhiwang Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenlong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06714v1-abstract-short" style="display: inline;"> Weather radar data synthesis can fill in data for areas where ground observations are missing. Existing methods often employ reconstruction-based approaches with MSE loss to reconstruct radar data from satellite observation. However, such methods lead to over-smoothing, which hinders the generation of high-frequency details or high-value observation areas associated with convective weather. To add… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06714v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06714v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06714v1-abstract-full" style="display: none;"> Weather radar data synthesis can fill in data for areas where ground observations are missing. Existing methods often employ reconstruction-based approaches with MSE loss to reconstruct radar data from satellite observation. However, such methods lead to over-smoothing, which hinders the generation of high-frequency details or high-value observation areas associated with convective weather. To address this issue, we propose a two-stage diffusion-based method called DiffSR. We first pre-train a reconstruction model on global-scale data to obtain radar estimation and then synthesize radar reflectivity by combining radar estimation results with satellite data as conditions for the diffusion model. Extensive experiments show that our method achieves state-of-the-art (SOTA) results, demonstrating the ability to generate high-frequency details and high-value areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06714v1-abstract-full').style.display = 'none'; document.getElementById('2411.06714v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06173">arXiv:2411.06173</a> <span> [<a href="https://arxiv.org/pdf/2411.06173">pdf</a>, <a href="https://arxiv.org/format/2411.06173">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LSSInst: Improving Geometric Modeling in LSS-Based BEV Perception with Instance Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weijie Ma</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jingwei Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zehui Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06173v2-abstract-short" style="display: inline;"> With the attention gained by camera-only 3D object detection in autonomous driving, methods based on Bird-Eye-View (BEV) representation especially derived from the forward view transformation paradigm, i.e., lift-splat-shoot (LSS), have recently seen significant progress. The BEV representation formulated by the frustum based on depth distribution prediction is ideal for learning the road structur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06173v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06173v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06173v2-abstract-full" style="display: none;"> With the attention gained by camera-only 3D object detection in autonomous driving, methods based on Bird-Eye-View (BEV) representation especially derived from the forward view transformation paradigm, i.e., lift-splat-shoot (LSS), have recently seen significant progress. The BEV representation formulated by the frustum based on depth distribution prediction is ideal for learning the road structure and scene layout from multi-view images. However, to retain computational efficiency, the compressed BEV representation such as in resolution and axis is inevitably weak in retaining the individual geometric details, undermining the methodological generality and applicability. With this in mind, to compensate for the missing details and utilize multi-view geometry constraints, we propose LSSInst, a two-stage object detector incorporating BEV and instance representations in tandem. The proposed detector exploits fine-grained pixel-level features that can be flexibly integrated into existing LSS-based BEV networks. Having said that, due to the inherent gap between two representation spaces, we design the instance adaptor for the BEV-to-instance semantic coherence rather than pass the proposal naively. Extensive experiments demonstrated that our proposed framework is of excellent generalization ability and performance, which boosts the performances of modern LSS-based BEV perception methods without bells and whistles and outperforms current LSS-based state-of-the-art works on the large-scale nuScenes benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06173v2-abstract-full').style.display = 'none'; document.getElementById('2411.06173v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 3DV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06160">arXiv:2411.06160</a> <span> [<a href="https://arxiv.org/pdf/2411.06160">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Expansion Quantization Network: An Efficient Micro-emotion Annotation and Detection Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Senlin Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haofan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06160v1-abstract-short" style="display: inline;"> Text emotion detection constitutes a crucial foundation for advancing artificial intelligence from basic comprehension to the exploration of emotional reasoning. Most existing emotion detection datasets rely on manual annotations, which are associated with high costs, substantial subjectivity, and severe label imbalances. This is particularly evident in the inadequate annotation of micro-emotions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06160v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06160v1-abstract-full" style="display: none;"> Text emotion detection constitutes a crucial foundation for advancing artificial intelligence from basic comprehension to the exploration of emotional reasoning. Most existing emotion detection datasets rely on manual annotations, which are associated with high costs, substantial subjectivity, and severe label imbalances. This is particularly evident in the inadequate annotation of micro-emotions and the absence of emotional intensity representation, which fail to capture the rich emotions embedded in sentences and adversely affect the quality of downstream task completion. By proposing an all-labels and training-set label regression method, we map label values to energy intensity levels, thereby fully leveraging the learning capabilities of machine models and the interdependencies among labels to uncover multiple emotions within samples. This led to the establishment of the Emotion Quantization Network (EQN) framework for micro-emotion detection and annotation. Using five commonly employed sentiment datasets, we conducted comparative experiments with various models, validating the broad applicability of our framework within NLP machine learning models. Based on the EQN framework, emotion detection and annotation are conducted on the GoEmotions dataset. A comprehensive comparison with the results from Google literature demonstrates that the EQN framework possesses a high capability for automatic detection and annotation of micro-emotions. The EQN framework is the first to achieve automatic micro-emotion annotation with energy-level scores, providing strong support for further emotion detection analysis and the quantitative research of emotion computing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06160v1-abstract-full').style.display = 'none'; document.getElementById('2411.06160v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06146">arXiv:2411.06146</a> <span> [<a href="https://arxiv.org/pdf/2411.06146">pdf</a>, <a href="https://arxiv.org/format/2411.06146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI-Compass: A Comprehensive and Effective Multi-module Testing Tool for AI Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhiyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhibo Jin</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongsheng Hu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+M">Minhui Xue</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoxi Sun</a>, <a href="/search/cs?searchtype=author&query=Camtepe%2C+S">Seyit Camtepe</a>, <a href="/search/cs?searchtype=author&query=Gauravaram%2C+P">Praveen Gauravaram</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huaming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06146v1-abstract-short" style="display: inline;"> AI systems, in particular with deep learning techniques, have demonstrated superior performance for various real-world applications. Given the need for tailored optimization in specific scenarios, as well as the concerns related to the exploits of subsurface vulnerabilities, a more comprehensive and in-depth testing AI system becomes a pivotal topic. We have seen the emergence of testing tools in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06146v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06146v1-abstract-full" style="display: none;"> AI systems, in particular with deep learning techniques, have demonstrated superior performance for various real-world applications. Given the need for tailored optimization in specific scenarios, as well as the concerns related to the exploits of subsurface vulnerabilities, a more comprehensive and in-depth testing AI system becomes a pivotal topic. We have seen the emergence of testing tools in real-world applications that aim to expand testing capabilities. However, they often concentrate on ad-hoc tasks, rendering them unsuitable for simultaneously testing multiple aspects or components. Furthermore, trustworthiness issues arising from adversarial attacks and the challenge of interpreting deep learning models pose new challenges for developing more comprehensive and in-depth AI system testing tools. In this study, we design and implement a testing tool, \tool, to comprehensively and effectively evaluate AI systems. The tool extensively assesses multiple measurements towards adversarial robustness, model interpretability, and performs neuron analysis. The feasibility of the proposed testing tool is thoroughly validated across various modalities, including image classification, object detection, and text classification. Extensive experiments demonstrate that \tool is the state-of-the-art tool for a comprehensive assessment of the robustness and trustworthiness of AI systems. Our research sheds light on a general solution for AI systems testing landscape. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06146v1-abstract-full').style.display = 'none'; document.getElementById('2411.06146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06059">arXiv:2411.06059</a> <span> [<a href="https://arxiv.org/pdf/2411.06059">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> ANCoEF: Asynchronous Neuromorphic Algorithm/Hardware Co-Exploration Framework with a Fully Asynchronous Simulator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingchen Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jilin Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06059v1-abstract-short" style="display: inline;"> Developing asynchronous neuromorphic hardware to meet the demands of diverse real-life edge scenarios remains significant challenges. These challenges include constraints on hardware resources and power budgets while satisfying the requirements for real-time responsiveness, reliable inference accuracy, and so on. Besides, the existing system-level simulators for asynchronous neuromorphic hardware… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06059v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06059v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06059v1-abstract-full" style="display: none;"> Developing asynchronous neuromorphic hardware to meet the demands of diverse real-life edge scenarios remains significant challenges. These challenges include constraints on hardware resources and power budgets while satisfying the requirements for real-time responsiveness, reliable inference accuracy, and so on. Besides, the existing system-level simulators for asynchronous neuromorphic hardware suffer from runtime limitations. To address these challenges, we propose an Asynchronous Neuromorphic algorithm/hardware Co-Exploration Framework (ANCoEF) including multi-objective reinforcement learning (RL)-based hardware architecture optimization method, and a fully asynchronous simulator (TrueAsync) which achieves over 2 times runtime speedups than the state-of-the-art (SOTA) simulator. Our experimental results show that, the RL-based hardware architecture optimization approach of ANCoEF outperforms the SOTA method by reducing 1.81 times hardware energy-delay product (EDP) with 2.73 times less search time on N-MNIST dataset, and the co-exploration framework of ANCoEF improves SNN accuracy by 9.72% and reduces hardware EDP by 28.85 times compared to the SOTA work on DVS128Gesture dataset. Furthermore, ANCoEF framework is evaluated on external neuromorphic dataset CIFAR10-DVS, and static datasets including CIFAR10, CIFAR100, SVHN, and Tiny-ImageNet. For instance, after 26.23 ThreadHour of co-exploration process, the result on CIFAR10-DVS dataset achieves an SNN accuracy of 98.48% while consuming hardware EDP of 0.54 s nJ per sample. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06059v1-abstract-full').style.display = 'none'; document.getElementById('2411.06059v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06041">arXiv:2411.06041</a> <span> [<a href="https://arxiv.org/pdf/2411.06041">pdf</a>, <a href="https://arxiv.org/format/2411.06041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PointCG: Self-supervised Point Cloud Learning via Joint Completion and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yun Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xuefeng Yan</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+L">Liangliang Nan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Honghua Chen</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+L">Lina Gong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+M">Mingqiang Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06041v1-abstract-short" style="display: inline;"> The core of self-supervised point cloud learning lies in setting up appropriate pretext tasks, to construct a pre-training framework that enables the encoder to perceive 3D objects effectively. In this paper, we integrate two prevalent methods, masked point modeling (MPM) and 3D-to-2D generation, as pretext tasks within a pre-training framework. We leverage the spatial awareness and precise superv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06041v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06041v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06041v1-abstract-full" style="display: none;"> The core of self-supervised point cloud learning lies in setting up appropriate pretext tasks, to construct a pre-training framework that enables the encoder to perceive 3D objects effectively. In this paper, we integrate two prevalent methods, masked point modeling (MPM) and 3D-to-2D generation, as pretext tasks within a pre-training framework. We leverage the spatial awareness and precise supervision offered by these two methods to address their respective limitations: ambiguous supervision signals and insensitivity to geometric information. Specifically, the proposed framework, abbreviated as PointCG, consists of a Hidden Point Completion (HPC) module and an Arbitrary-view Image Generation (AIG) module. We first capture visible points from arbitrary views as inputs by removing hidden points. Then, HPC extracts representations of the inputs with an encoder and completes the entire shape with a decoder, while AIG is used to generate rendered images based on the visible points' representations. Extensive experiments demonstrate the superiority of the proposed method over the baselines in various downstream tasks. Our code will be made available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06041v1-abstract-full').style.display = 'none'; document.getElementById('2411.06041v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06023">arXiv:2411.06023</a> <span> [<a href="https://arxiv.org/pdf/2411.06023">pdf</a>, <a href="https://arxiv.org/format/2411.06023">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Textual Prompt For Rehearsal-free Lifelong Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+B">Bingliang Jiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06023v1-abstract-short" style="display: inline;"> Lifelong person re-identification attempts to recognize people across cameras and integrate new knowledge from continuous data streams. Key challenges involve addressing catastrophic forgetting caused by parameter updating and domain shift, and maintaining performance in seen and unseen domains. Many previous works rely on data memories to retain prior samples. However, the amount of retained data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06023v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06023v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06023v1-abstract-full" style="display: none;"> Lifelong person re-identification attempts to recognize people across cameras and integrate new knowledge from continuous data streams. Key challenges involve addressing catastrophic forgetting caused by parameter updating and domain shift, and maintaining performance in seen and unseen domains. Many previous works rely on data memories to retain prior samples. However, the amount of retained data increases linearly with the number of training domains, leading to continually increasing memory consumption. Additionally, these methods may suffer significant performance degradation when data preservation is prohibited due to privacy concerns. To address these limitations, we propose using textual descriptions as guidance to encourage the ReID model to learn cross-domain invariant features without retaining samples. The key insight is that natural language can describe pedestrian instances with an invariant style, suggesting a shared textual space for any pedestrian images. By leveraging this shared textual space as an anchor, we can prompt the ReID model to embed images from various domains into a unified semantic space, thereby alleviating catastrophic forgetting caused by domain shifts. To achieve this, we introduce a task-driven dynamic textual prompt framework in this paper. This model features a dynamic prompt fusion module, which adaptively constructs and fuses two different textual prompts as anchors. This effectively guides the ReID model to embed images into a unified semantic space. Additionally, we design a text-visual feature alignment module to learn a more precise mapping between fine-grained visual and textual features. We also developed a learnable knowledge distillation module that allows our model to dynamically balance retaining existing knowledge with acquiring new knowledge. Extensive experiments demonstrate that our method outperforms SOTAs under various settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06023v1-abstract-full').style.display = 'none'; document.getElementById('2411.06023v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05504">arXiv:2411.05504</a> <span> [<a href="https://arxiv.org/pdf/2411.05504">pdf</a>, <a href="https://arxiv.org/format/2411.05504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LBPE: Long-token-first Tokenization to Improve Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lian%2C+H">Haoran Lian</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yizhe Xiong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zijia Lin</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+J">Jianwei Niu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+S">Shasha Mo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+G">Guiguang Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05504v1-abstract-short" style="display: inline;"> The prevalent use of Byte Pair Encoding (BPE) in Large Language Models (LLMs) facilitates robust handling of subword units and avoids issues of out-of-vocabulary words. Despite its success, a critical challenge persists: long tokens, rich in semantic information, have fewer occurrences in tokenized datasets compared to short tokens, which can result in imbalanced learning issue across different to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05504v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05504v1-abstract-full" style="display: none;"> The prevalent use of Byte Pair Encoding (BPE) in Large Language Models (LLMs) facilitates robust handling of subword units and avoids issues of out-of-vocabulary words. Despite its success, a critical challenge persists: long tokens, rich in semantic information, have fewer occurrences in tokenized datasets compared to short tokens, which can result in imbalanced learning issue across different tokens. To address that, we propose LBPE, which prioritizes long tokens during the encoding process. LBPE generates tokens according to their reverse ranks of token length rather than their ranks in the vocabulary, granting longer tokens higher priority during the encoding process. Consequently, LBPE smooths the frequency differences between short and long tokens, and thus mitigates the learning imbalance. Extensive experiments across diverse language modeling tasks demonstrate that LBPE consistently outperforms the original BPE, well demonstrating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05504v1-abstract-full').style.display = 'none'; document.getElementById('2411.05504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2404.17808</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05420">arXiv:2411.05420</a> <span> [<a href="https://arxiv.org/pdf/2411.05420">pdf</a>, <a href="https://arxiv.org/format/2411.05420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> WeatherGFM: Learning A Weather Generalist Foundation Model via In-context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhiwang Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenlong Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yihao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+J">Junchao Gong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+B">Ben Fei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiao-Ming Wu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05420v1-abstract-short" style="display: inline;"> The Earth's weather system encompasses intricate weather data modalities and diverse weather understanding tasks, which hold significant value to human life. Existing data-driven models focus on single weather understanding tasks (e.g., weather forecasting). Although these models have achieved promising results, they fail to tackle various complex tasks within a single and unified model. Moreover,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05420v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05420v1-abstract-full" style="display: none;"> The Earth's weather system encompasses intricate weather data modalities and diverse weather understanding tasks, which hold significant value to human life. Existing data-driven models focus on single weather understanding tasks (e.g., weather forecasting). Although these models have achieved promising results, they fail to tackle various complex tasks within a single and unified model. Moreover, the paradigm that relies on limited real observations for a single scenario hinders the model's performance upper bound. In response to these limitations, we draw inspiration from the in-context learning paradigm employed in state-of-the-art visual foundation models and large language models. In this paper, we introduce the first generalist weather foundation model (WeatherGFM), designed to address a wide spectrum of weather understanding tasks in a unified manner. More specifically, we initially unify the representation and definition of the diverse weather understanding tasks. Subsequently, we devised weather prompt formats to manage different weather data modalities, namely single, multiple, and temporal modalities. Finally, we adopt a visual prompting question-answering paradigm for the training of unified weather understanding tasks. Extensive experiments indicate that our WeatherGFM can effectively handle up to ten weather understanding tasks, including weather forecasting, super-resolution, weather image translation, and post-processing. Our method also showcases generalization ability on unseen tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05420v1-abstract-full').style.display = 'none'; document.getElementById('2411.05420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05010">arXiv:2411.05010</a> <span> [<a href="https://arxiv.org/pdf/2411.05010">pdf</a>, <a href="https://arxiv.org/format/2411.05010">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scattered Forest Search: Smarter Code Space Exploration with LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Light%2C+J">Jonathan Light</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yiyou Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenchao Yu</a>, <a href="/search/cs?searchtype=author&query=liu%2C+Y">Yanchi liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xujiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Ziniu Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05010v1-abstract-short" style="display: inline;"> We propose a novel approach to scaling LLM inference for code generation. We frame code generation as a black box optimization problem within the code space, and employ optimization-inspired techniques to enhance exploration. Specifically, we introduce Scattered Forest Search to enhance solution diversity while searching for solutions. Our theoretical analysis illustrates how these methods avoid l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05010v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05010v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05010v1-abstract-full" style="display: none;"> We propose a novel approach to scaling LLM inference for code generation. We frame code generation as a black box optimization problem within the code space, and employ optimization-inspired techniques to enhance exploration. Specifically, we introduce Scattered Forest Search to enhance solution diversity while searching for solutions. Our theoretical analysis illustrates how these methods avoid local optima during optimization. Extensive experiments on HumanEval, MBPP, APPS, CodeContests, and Leetcode reveal significant performance improvements. For instance, our method achieves a pass@1 rate of 67.1% on HumanEval+ and 87.2% on HumanEval with GPT-3.5, marking improvements of 8.6% and 4.3% over the state-of-the-art, while also halving the iterations needed to find the correct solution. Furthermore, our method scales more efficiently than existing search techniques, including tree search, line search, and repeated sampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05010v1-abstract-full').style.display = 'none'; document.getElementById('2411.05010v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04862">arXiv:2411.04862</a> <span> [<a href="https://arxiv.org/pdf/2411.04862">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Sentiment Analysis of Spanish Political Party Tweets Using Pre-trained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+C">Chuqiao Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shunzhang Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xinyi Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04862v1-abstract-short" style="display: inline;"> Title: Sentiment Analysis of Spanish Political Party Communications on Twitter Using Pre-trained Language Models Authors: Chuqiao Song, Shunzhang Chen, Xinyi Cai, Hao Chen Comments: 21 pages, 6 figures Abstract: This study investigates sentiment patterns within Spanish political party communications on Twitter by leveraging BETO and RoBERTuito, two pre-trained language models optimized for S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04862v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04862v1-abstract-full" style="display: none;"> Title: Sentiment Analysis of Spanish Political Party Communications on Twitter Using Pre-trained Language Models Authors: Chuqiao Song, Shunzhang Chen, Xinyi Cai, Hao Chen Comments: 21 pages, 6 figures Abstract: This study investigates sentiment patterns within Spanish political party communications on Twitter by leveraging BETO and RoBERTuito, two pre-trained language models optimized for Spanish text. Using a dataset of tweets from major Spanish political parties: PSOE, PP, Vox, Podemos, and Ciudadanos, spanning 2019 to 2024, this research analyzes sentiment distributions and explores the relationship between sentiment expression and party ideology. The findings indicate that both models consistently identify a predominant Neutral sentiment across all parties, with significant variations in Negative and Positive sentiments that align with ideological distinctions. Specifically, Vox exhibits higher levels of Negative sentiment, while PSOE demonstrates relatively high Positive sentiment, supporting the hypothesis that emotional appeals in political messaging reflect ideological stances. This study underscores the potential of pre-trained language models for non-English sentiment analysis on social media, providing insights into sentiment dynamics that shape public discourse within Spain's multi-party political system. Keywords: Spanish politics, sentiment analysis, pre-trained language models, Twitter, BETO, RoBERTuito, political ideology, multi-party system <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04862v1-abstract-full').style.display = 'none'; document.getElementById('2411.04862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 (Natural Language Processing); 68T10 (Pattern Recognition; Speech Recognition); 91F10 (Political Science) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04282">arXiv:2411.04282</a> <span> [<a href="https://arxiv.org/pdf/2411.04282">pdf</a>, <a href="https://arxiv.org/format/2411.04282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Language Models are Hidden Reasoners: Unlocking Latent Reasoning Capabilities via Self-Rewarding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haolin Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yihao Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zuxin Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Weiran Yao</a>, <a href="/search/cs?searchtype=author&query=Prabhakar%2C+A">Akshara Prabhakar</a>, <a href="/search/cs?searchtype=author&query=Heinecke%2C+S">Shelby Heinecke</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+R">Ricky Ho</a>, <a href="/search/cs?searchtype=author&query=Mui%2C+P">Phil Mui</a>, <a href="/search/cs?searchtype=author&query=Savarese%2C+S">Silvio Savarese</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04282v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown impressive capabilities, but still struggle with complex reasoning tasks requiring multiple steps. While prompt-based methods like Chain-of-Thought (CoT) can improve LLM reasoning at inference time, optimizing reasoning capabilities during training remains challenging. We introduce LaTent Reasoning Optimization (LaTRO), a principled framework that formulates… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04282v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04282v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown impressive capabilities, but still struggle with complex reasoning tasks requiring multiple steps. While prompt-based methods like Chain-of-Thought (CoT) can improve LLM reasoning at inference time, optimizing reasoning capabilities during training remains challenging. We introduce LaTent Reasoning Optimization (LaTRO), a principled framework that formulates reasoning as sampling from a latent distribution and optimizes it via variational approaches. LaTRO enables LLMs to concurrently improve both their reasoning process and ability to evaluate reasoning quality, without requiring external feedback or reward models. We validate LaTRO through experiments on GSM8K and ARC-Challenge datasets using multiple model architectures. On GSM8K, LaTRO improves zero-shot accuracy by an average of 12.5% over base models and 9.6% over supervised fine-tuning across Phi-3.5-mini, Mistral-7B, and Llama-3.1-8B. Our findings suggest that pre-trained LLMs possess latent reasoning capabilities that can be unlocked and enhanced through our proposed optimization approach in a self-improvement manner. The code of LaTRO is available at \url{https://github.com/SalesforceAIResearch/LaTRO}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04282v1-abstract-full').style.display = 'none'; document.getElementById('2411.04282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03670">arXiv:2411.03670</a> <span> [<a href="https://arxiv.org/pdf/2411.03670">pdf</a>, <a href="https://arxiv.org/format/2411.03670">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Touchstone Benchmark: Are We on the Right Way for Evaluating AI Algorithms for Medical Segmentation? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bassi%2C+P+R+A+S">Pedro R. A. S. Bassi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenxuan Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&query=Isensee%2C+F">Fabian Isensee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zifu Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jieneng Chen</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+Y">Yu-Cheng Chou</a>, <a href="/search/cs?searchtype=author&query=Kirchhoff%2C+Y">Yannick Kirchhoff</a>, <a href="/search/cs?searchtype=author&query=Rokuss%2C+M">Maximilian Rokuss</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziyan Huang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a>, <a href="/search/cs?searchtype=author&query=Wald%2C+T">Tassilo Wald</a>, <a href="/search/cs?searchtype=author&query=Ulrich%2C+C">Constantin Ulrich</a>, <a href="/search/cs?searchtype=author&query=Baumgartner%2C+M">Michael Baumgartner</a>, <a href="/search/cs?searchtype=author&query=Roy%2C+S">Saikat Roy</a>, <a href="/search/cs?searchtype=author&query=Maier-Hein%2C+K+H">Klaus H. Maier-Hein</a>, <a href="/search/cs?searchtype=author&query=Jaeger%2C+P">Paul Jaeger</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yiwen Ye</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yutong Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianpeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyang Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yong Xia</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Z">Zhaohu Xing</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lei Zhu</a> , et al. (28 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03670v1-abstract-short" style="display: inline;"> How can we test AI performance? This question seems trivial, but it isn't. Standard benchmarks often have problems such as in-distribution and small-size test sets, oversimplified metrics, unfair comparisons, and short-term outcome pressure. As a consequence, good performance on standard benchmarks does not guarantee success in real-world scenarios. To address these problems, we present Touchstone… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03670v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03670v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03670v1-abstract-full" style="display: none;"> How can we test AI performance? This question seems trivial, but it isn't. Standard benchmarks often have problems such as in-distribution and small-size test sets, oversimplified metrics, unfair comparisons, and short-term outcome pressure. As a consequence, good performance on standard benchmarks does not guarantee success in real-world scenarios. To address these problems, we present Touchstone, a large-scale collaborative segmentation benchmark of 9 types of abdominal organs. This benchmark is based on 5,195 training CT scans from 76 hospitals around the world and 5,903 testing CT scans from 11 additional hospitals. This diverse test set enhances the statistical significance of benchmark results and rigorously evaluates AI algorithms across various out-of-distribution scenarios. We invited 14 inventors of 19 AI algorithms to train their algorithms, while our team, as a third party, independently evaluated these algorithms on three test sets. In addition, we also evaluated pre-existing AI frameworks--which, differing from algorithms, are more flexible and can support different algorithms--including MONAI from NVIDIA, nnU-Net from DKFZ, and numerous other open-source frameworks. We are committed to expanding this benchmark to encourage more innovation of AI algorithms for the medical domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03670v1-abstract-full').style.display = 'none'; document.getElementById('2411.03670v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03357">arXiv:2411.03357</a> <span> [<a href="https://arxiv.org/pdf/2411.03357">pdf</a>, <a href="https://arxiv.org/format/2411.03357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> PipeLLM: Fast and Confidential Large Language Model Services with Speculative Pipelined Encryption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yifan Tan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+Z">Zeyu Mi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haibo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03357v1-abstract-short" style="display: inline;"> Confidential computing on GPUs, like NVIDIA H100, mitigates the security risks of outsourced Large Language Models (LLMs) by implementing strong isolation and data encryption. Nonetheless, this encryption incurs a significant performance overhead, reaching up to 52.8 percent and 88.2 percent throughput drop when serving OPT-30B and OPT-66B, respectively. To address this challenge, we introduce Pip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03357v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03357v1-abstract-full" style="display: none;"> Confidential computing on GPUs, like NVIDIA H100, mitigates the security risks of outsourced Large Language Models (LLMs) by implementing strong isolation and data encryption. Nonetheless, this encryption incurs a significant performance overhead, reaching up to 52.8 percent and 88.2 percent throughput drop when serving OPT-30B and OPT-66B, respectively. To address this challenge, we introduce PipeLLM, a user-transparent runtime system. PipeLLM removes the overhead by overlapping the encryption and GPU computation through pipelining - an idea inspired by the CPU instruction pipelining - thereby effectively concealing the latency increase caused by encryption. The primary technical challenge is that, unlike CPUs, the encryption module lacks prior knowledge of the specific data needing encryption until it is requested by the GPUs. To this end, we propose speculative pipelined encryption to predict the data requiring encryption by analyzing the serving patterns of LLMs. Further, we have developed an efficient, low-cost pipeline relinquishing approach for instances of incorrect predictions. Our experiments on NVIDIA H100 GPU show that compared with vanilla systems without confidential computing (e.g., vLLM, PEFT, and FlexGen), PipeLLM incurs modest overhead (less than 19.6 percent in throughput) across various LLM sizes, from 13B to 175B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03357v1-abstract-full').style.display = 'none'; document.getElementById('2411.03357v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ASPLOS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03286">arXiv:2411.03286</a> <span> [<a href="https://arxiv.org/pdf/2411.03286">pdf</a>, <a href="https://arxiv.org/format/2411.03286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiT4Edit: Diffusion Transformer for Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+K">Kunyu Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yue Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haozhe Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03286v2-abstract-short" style="display: inline;"> Despite recent advances in UNet-based image editing, methods for shape-aware object editing in high-resolution images are still lacking. Compared to UNet, Diffusion Transformers (DiT) demonstrate superior capabilities to effectively capture the long-range dependencies among patches, leading to higher-quality image generation. In this paper, we propose DiT4Edit, the first Diffusion Transformer-base… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03286v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03286v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03286v2-abstract-full" style="display: none;"> Despite recent advances in UNet-based image editing, methods for shape-aware object editing in high-resolution images are still lacking. Compared to UNet, Diffusion Transformers (DiT) demonstrate superior capabilities to effectively capture the long-range dependencies among patches, leading to higher-quality image generation. In this paper, we propose DiT4Edit, the first Diffusion Transformer-based image editing framework. Specifically, DiT4Edit uses the DPM-Solver inversion algorithm to obtain the inverted latents, reducing the number of steps compared to the DDIM inversion algorithm commonly used in UNet-based frameworks. Additionally, we design unified attention control and patches merging, tailored for transformer computation streams. This integration allows our framework to generate higher-quality edited images faster. Our design leverages the advantages of DiT, enabling it to surpass UNet structures in image editing, especially in high-resolution and arbitrary-size images. Extensive experiments demonstrate the strong performance of DiT4Edit across various editing scenarios, highlighting the potential of Diffusion Transformers in supporting image editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03286v2-abstract-full').style.display = 'none'; document.getElementById('2411.03286v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03053">arXiv:2411.03053</a> <span> [<a href="https://arxiv.org/pdf/2411.03053">pdf</a>, <a href="https://arxiv.org/format/2411.03053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gradient-Guided Conditional Diffusion Models for Private Image Reconstruction: Analyzing Adversarial Impacts of Differential Privacy and Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tao Huang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jiayang Meng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+G">Guolong Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xu Yang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xun Yi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hua Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03053v1-abstract-short" style="display: inline;"> We investigate the construction of gradient-guided conditional diffusion models for reconstructing private images, focusing on the adversarial interplay between differential privacy noise and the denoising capabilities of diffusion models. While current gradient-based reconstruction methods struggle with high-resolution images due to computational complexity and prior knowledge requirements, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03053v1-abstract-full" style="display: none;"> We investigate the construction of gradient-guided conditional diffusion models for reconstructing private images, focusing on the adversarial interplay between differential privacy noise and the denoising capabilities of diffusion models. While current gradient-based reconstruction methods struggle with high-resolution images due to computational complexity and prior knowledge requirements, we propose two novel methods that require minimal modifications to the diffusion model's generation process and eliminate the need for prior knowledge. Our approach leverages the strong image generation capabilities of diffusion models to reconstruct private images starting from randomly generated noise, even when a small amount of differentially private noise has been added to the gradients. We also conduct a comprehensive theoretical analysis of the impact of differential privacy noise on the quality of reconstructed images, revealing the relationship among noise magnitude, the architecture of attacked models, and the attacker's reconstruction capability. Additionally, extensive experiments validate the effectiveness of our proposed methods and the accuracy of our theoretical findings, suggesting new directions for privacy risk auditing using conditional diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03053v1-abstract-full').style.display = 'none'; document.getElementById('2411.03053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02720">arXiv:2411.02720</a> <span> [<a href="https://arxiv.org/pdf/2411.02720">pdf</a>, <a href="https://arxiv.org/ps/2411.02720">ps</a>, <a href="https://arxiv.org/format/2411.02720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Self-Dual Cyclic Codes with Square-Root-Like Lower Bounds on Their Minimum Distances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Cunsheng Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02720v1-abstract-short" style="display: inline;"> Binary self-dual cyclic codes have been studied since the classical work of Sloane and Thompson published in IEEE Trans. Inf. Theory, vol. 29, 1983. Twenty five years later, an infinite family of binary self-dual cyclic codes with lengths $n_i$ and minimum distances $d_i \geq \frac{1}{2} \sqrt{n_i+2}$ was presented in a paper of IEEE Trans. Inf. Theory, vol. 55, 2009. However, no infinite family o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02720v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02720v1-abstract-full" style="display: none;"> Binary self-dual cyclic codes have been studied since the classical work of Sloane and Thompson published in IEEE Trans. Inf. Theory, vol. 29, 1983. Twenty five years later, an infinite family of binary self-dual cyclic codes with lengths $n_i$ and minimum distances $d_i \geq \frac{1}{2} \sqrt{n_i+2}$ was presented in a paper of IEEE Trans. Inf. Theory, vol. 55, 2009. However, no infinite family of Euclidean self-dual binary cyclic codes whose minimum distances have the square-root lower bound and no infinite family of Euclidean self-dual nonbinary cyclic codes whose minimum distances have a lower bound better than the square-root lower bound are known in the literature. In this paper, an infinite family of Euclidean self-dual cyclic codes over the fields ${\bf F}_{2^s}$ with a square-root-like lower bound is constructed. An infinite subfamily of this family consists of self-dual binary cyclic codes with the square-root lower bound. Another infinite subfamily of this family consists of self-dual cyclic codes over the fields ${\bf F}_{2^s}$ with a lower bound better than the square-root bound for $s \geq 2$. Consequently, two breakthroughs in coding theory are made in this paper. An infinite family of self-dual binary cyclic codes with a square-root-like lower bound is also presented in this paper. An infinite family of Hermitian self-dual cyclic codes over the fields ${\bf F}_{2^{2s}}$ with a square-root-like lower bound and an infinite family of Euclidean self-dual linear codes over ${\bf F}_{q}$ with $q \equiv 1 \pmod{4}$ with a square-root-like lower bound are also constructed in this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02720v1-abstract-full').style.display = 'none'; document.getElementById('2411.02720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02256">arXiv:2411.02256</a> <span> [<a href="https://arxiv.org/pdf/2411.02256">pdf</a>, <a href="https://arxiv.org/format/2411.02256">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unified Speech Recognition: A Single Model for Auditory, Visual, and Audiovisual Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&query=Landgraf%2C+Z">Zoe Landgraf</a>, <a href="/search/cs?searchtype=author&query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02256v1-abstract-short" style="display: inline;"> Research in auditory, visual, and audiovisual speech recognition (ASR, VSR, and AVSR, respectively) has traditionally been conducted independently. Even recent self-supervised studies addressing two or all three tasks simultaneously tend to yield separate models, leading to disjoint inference pipelines with increased memory requirements and redundancies. This paper proposes unified training strate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02256v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02256v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02256v1-abstract-full" style="display: none;"> Research in auditory, visual, and audiovisual speech recognition (ASR, VSR, and AVSR, respectively) has traditionally been conducted independently. Even recent self-supervised studies addressing two or all three tasks simultaneously tend to yield separate models, leading to disjoint inference pipelines with increased memory requirements and redundancies. This paper proposes unified training strategies for these systems. We demonstrate that training a single model for all three tasks enhances VSR and AVSR performance, overcoming typical optimisation challenges when training from scratch. Moreover, we introduce a greedy pseudo-labelling approach to more effectively leverage unlabelled samples, addressing shortcomings in related self-supervised methods. Finally, we develop a self-supervised pre-training method within our framework, proving its effectiveness alongside our semi-supervised approach. Despite using a single model for all tasks, our unified approach achieves state-of-the-art performance compared to recent methods on LRS3 and LRS2 for ASR, VSR, and AVSR, as well as on the newly released WildVSR dataset. Code and models are available at https://github.com/ahaliassos/usr. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02256v1-abstract-full').style.display = 'none'; document.getElementById('2411.02256v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024. Code: https://github.com/ahaliassos/usr</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02108">arXiv:2411.02108</a> <span> [<a href="https://arxiv.org/pdf/2411.02108">pdf</a>, <a href="https://arxiv.org/ps/2411.02108">ps</a>, <a href="https://arxiv.org/format/2411.02108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Optimizing AoI at Query in Multiuser Wireless Uplink Networks: A Whittle Index Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingwei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">He Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02108v4-abstract-short" style="display: inline;"> In this paper, we explore how to schedule multiple users to optimize information freshness in a pull-based wireless network, where the status updates from users are requested by randomly arriving queries at the destination. We use the age of information at query (QAoI) to characterize the performance of information freshness. Such a decision-making problem is naturally modeled as a Markov decision… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02108v4-abstract-full').style.display = 'inline'; document.getElementById('2411.02108v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02108v4-abstract-full" style="display: none;"> In this paper, we explore how to schedule multiple users to optimize information freshness in a pull-based wireless network, where the status updates from users are requested by randomly arriving queries at the destination. We use the age of information at query (QAoI) to characterize the performance of information freshness. Such a decision-making problem is naturally modeled as a Markov decision process (MDP), which, however, is prohibitively high to be solved optimally by the standard method due to the curse of dimensionality. To address this issue, we employ Whittle index approach, which allows us to decouple the original MDP into multiple sub-MDPs by relaxing the scheduling constraints. However, the binary Markovian query arrival process results in a bi-dimensional state and complex state transitions within each sub-MDP, making it challenging to verify Whittle indexability using conventional methods. After a thorough analysis of the sub-MDP's structure, we show that it is unichain and its optimal policy follows a threshold-type structure. This facilitates the verification of Whittle indexability of the sub-MDP by employing an easy-to-verify condition. Subsequently, the steady-state probability distributions of the sub-MDP under different threshold-type policies are analyzed, constituting the analytical expressions of different Whittle indices in terms of the expected average QAoI and scheduling time of the sub-MDP. Building on these, we devise an efficient algorithm to calculate Whittle indices for the formulated sub-MDPs. The simulation results validate our analyses and show the proposed Whittle index policy outperforms baseline policies and achieves near-optimal performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02108v4-abstract-full').style.display = 'none'; document.getElementById('2411.02108v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02059">arXiv:2411.02059</a> <span> [<a href="https://arxiv.org/pdf/2411.02059">pdf</a>, <a href="https://arxiv.org/format/2411.02059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> TableGPT2: A Large Multimodal Model with Tabular Data Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+A">Aofeng Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Aowen Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+C">Chao Ye</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chen Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ga Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guangcheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haobo Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haokai Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoze Li</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+H">Haoxuan Lan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiaming Tian</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jing Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junlin Zhou</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+K">Kaizhe Shou</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+L">Liangyu Zha</a>, <a href="/search/cs?searchtype=author&query=Long%2C+L">Lin Long</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liyao Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pengzuo Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingyi Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saisai Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tao Zhang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02059v3-abstract-short" style="display: inline;"> The emergence of models like GPTs, Claude, LLaMA, and Qwen has reshaped AI applications, presenting vast new opportunities across industries. Yet, the integration of tabular data remains notably underdeveloped, despite its foundational role in numerous real-world domains. This gap is critical for three main reasons. First, database or data warehouse data integration is essential for advanced app… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02059v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02059v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02059v3-abstract-full" style="display: none;"> The emergence of models like GPTs, Claude, LLaMA, and Qwen has reshaped AI applications, presenting vast new opportunities across industries. Yet, the integration of tabular data remains notably underdeveloped, despite its foundational role in numerous real-world domains. This gap is critical for three main reasons. First, database or data warehouse data integration is essential for advanced applications; second, the vast and largely untapped resource of tabular data offers immense potential for analysis; and third, the business intelligence domain specifically demands adaptable, precise solutions that many current LLMs may struggle to provide. In response, we introduce TableGPT2, a model rigorously pre-trained and fine-tuned with over 593.8K tables and 2.36M high-quality query-table-output tuples, a scale of table-related data unprecedented in prior research. This extensive training enables TableGPT2 to excel in table-centric tasks while maintaining strong general language and coding abilities. One of TableGPT2's key innovations is its novel table encoder, specifically designed to capture schema-level and cell-level information. This encoder strengthens the model's ability to handle ambiguous queries, missing column names, and irregular tables commonly encountered in real-world applications. Similar to visual language models, this pioneering approach integrates with the decoder to form a robust large multimodal model. We believe the results are compelling: over 23 benchmarking metrics, TableGPT2 achieves an average performance improvement of 35.20% in the 7B model and 49.32% in the 72B model over prior benchmark-neutral LLMs, with robust general-purpose capabilities intact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02059v3-abstract-full').style.display = 'none'; document.getElementById('2411.02059v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01445">arXiv:2411.01445</a> <span> [<a href="https://arxiv.org/pdf/2411.01445">pdf</a>, <a href="https://arxiv.org/format/2411.01445">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Visual Question Answering Method for SAR Ship: Breaking the Requirement for Multimodal Dataset Construction and Model Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chengcheng Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yugang Chang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Weiming Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01445v1-abstract-short" style="display: inline;"> Current visual question answering (VQA) tasks often require constructing multimodal datasets and fine-tuning visual language models, which demands significant time and resources. This has greatly hindered the application of VQA to downstream tasks, such as ship information analysis based on Synthetic Aperture Radar (SAR) imagery. To address this challenge, this letter proposes a novel VQA approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01445v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01445v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01445v1-abstract-full" style="display: none;"> Current visual question answering (VQA) tasks often require constructing multimodal datasets and fine-tuning visual language models, which demands significant time and resources. This has greatly hindered the application of VQA to downstream tasks, such as ship information analysis based on Synthetic Aperture Radar (SAR) imagery. To address this challenge, this letter proposes a novel VQA approach that integrates object detection networks with visual language models, specifically designed for analyzing ships in SAR images. This integration aims to enhance the capabilities of VQA systems, focusing on aspects such as ship location, density, and size analysis, as well as risk behavior detection. Initially, we conducted baseline experiments using YOLO networks on two representative SAR ship detection datasets, SSDD and HRSID, to assess each model's performance in terms of detection accuracy. Based on these results, we selected the optimal model, YOLOv8n, as the most suitable detection network for this task. Subsequently, leveraging the vision-language model Qwen2-VL, we designed and implemented a VQA task specifically for SAR scenes. This task employs the ship location and size information output by the detection network to generate multi-turn dialogues and scene descriptions for SAR imagery. Experimental results indicate that this method not only enables fundamental SAR scene question-answering without the need for additional datasets or fine-tuning but also dynamically adapts to complex, multi-turn dialogue requirements, demonstrating robust semantic understanding and adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01445v1-abstract-full').style.display = 'none'; document.getElementById('2411.01445v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01431">arXiv:2411.01431</a> <span> [<a href="https://arxiv.org/pdf/2411.01431">pdf</a>, <a href="https://arxiv.org/format/2411.01431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Deep Learning Infrastructures for Embedded Computing Systems: A Comprehensive Survey and Future Envision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiangzhong Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Di Liu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+H">Hao Kong</a>, <a href="/search/cs?searchtype=author&query=Huai%2C+S">Shuo Huai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guochu Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weichen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01431v1-abstract-short" style="display: inline;"> Deep neural networks (DNNs) have recently achieved impressive success across a wide range of real-world vision and language processing tasks, spanning from image classification to many other downstream vision tasks, such as object detection, tracking, and segmentation. However, previous well-established DNNs, despite being able to maintain superior accuracy, have also been evolving to be deeper an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01431v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01431v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01431v1-abstract-full" style="display: none;"> Deep neural networks (DNNs) have recently achieved impressive success across a wide range of real-world vision and language processing tasks, spanning from image classification to many other downstream vision tasks, such as object detection, tracking, and segmentation. However, previous well-established DNNs, despite being able to maintain superior accuracy, have also been evolving to be deeper and wider and thus inevitably necessitate prohibitive computational resources for both training and inference. This trend further enlarges the computational gap between computation-intensive DNNs and resource-constrained embedded computing systems, making it challenging to deploy powerful DNNs upon real-world embedded computing systems towards ubiquitous embedded intelligence. To alleviate the above computational gap and enable ubiquitous embedded intelligence, we, in this survey, focus on discussing recent efficient deep learning infrastructures for embedded computing systems, spanning from training to inference, from manual to automated, from convolutional neural networks to transformers, from transformers to vision transformers, from vision models to large language models, from software to hardware, and from algorithms to applications. Specifically, we discuss recent efficient deep learning infrastructures for embedded computing systems from the lens of (1) efficient manual network design for embedded computing systems, (2) efficient automated network design for embedded computing systems, (3) efficient network compression for embedded computing systems, (4) efficient on-device learning for embedded computing systems, (5) efficient large language models for embedded computing systems, (6) efficient deep learning software and hardware for embedded computing systems, and (7) efficient intelligent applications for embedded computing systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01431v1-abstract-full').style.display = 'none'; document.getElementById('2411.01431v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Transactions on Embedded Computing Systems (TECS) 2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository