Search | arXiv e-print repository

Showing 1–50 of 769 results for author: Ye, J

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2411.12814 [pdf, other] (cs.CV)
   Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline
   Authors: Junlong Cheng, Bin Fu, Jin Ye, Guoan Wang, Tianbin Li, Haoyu Wang, Ruoyu Li, He Yao, Junren Chen, JingWen Li, Yanzhou Su, Min Zhu, Junjun He
   Abstract: Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground-truth masks from multiple data sources. Then, leveraging the strong object-recognition capabilities of a vision foundation model, we automatically generate dense interactive masks for each image and ensure their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited to specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks, an average of 56 masks per image. Finally, we develop an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundation models in medical computer vision, we release IMed-361M and the model at https://github.com/uni-medical/IMIS-Bench.
   Submitted 19 November, 2024; originally announced November 2024.
2. arXiv:2411.09968 [pdf, other] (cs.CV, cs.AI)
   Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate Hallucination in LVLMs
   Authors: Xiaofeng Zhang, Yihao Quan, Chaochen Gu, Chen Shen, Xiaosong Yuan, Shaotian Yan, Hao Cheng, Kaijie Wu, Jieping Ye
   Abstract: The hallucination problem in multimodal large language models (MLLMs) remains a common issue. Although image tokens occupy a majority of the input sequence of MLLMs, there is little research exploring the relationship between image tokens and hallucinations. In this paper, we analyze the distribution of attention scores for image tokens across each layer and head of the model, revealing an intriguing and common phenomenon: most hallucinations are closely linked to the pattern of attention sinks in the self-attention matrix of image tokens, where shallow layers exhibit dense attention sinks and deeper layers show sparse attention sinks. We further analyze the attention heads of different layers and find that heads with high-density attention sinks in the image part play a positive role in alleviating hallucinations. We propose a training-free method named Enhancing Attention Heads (EAH), designed to enhance the convergence of image-token attention sinks in the shallow layers. EAH identifies the attention head that shows the vision sink in a shallow layer and extracts its attention matrix. This attention map is then broadcast to the other heads in the layer, thereby strengthening the layer's attention to the image itself. In extensive experiments, EAH shows significant hallucination-mitigating performance on different MLLMs and metrics, demonstrating its effectiveness and generality.
   Submitted 15 November, 2024; originally announced November 2024.
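   A minimal sketch of the broadcast step the abstract describes: score each head of a shallow layer by its attention mass on image tokens, then copy the densest head's map to the layer's other heads. Tensor shapes and the sink score are assumptions, not the authors' implementation:

   ```python
   # Toy version of the EAH idea: find the most "sink-like" head in a shallow
   # layer and broadcast its attention map to the other heads of that layer.
   import torch

   def enhance_attention_heads(attn: torch.Tensor, image_slice: slice) -> torch.Tensor:
       """attn: (heads, seq, seq) attention of one shallow layer."""
       # Score each head by total attention paid to image-token columns (assumed score).
       sink_score = attn[:, :, image_slice].sum(dim=(1, 2))      # (heads,)
       best = sink_score.argmax()
       # Broadcast the densest head's map to every head in the layer.
       return attn[best].unsqueeze(0).expand_as(attn).clone()

   attn = torch.softmax(torch.randn(8, 64, 64), dim=-1)  # toy layer, 8 heads
   out = enhance_attention_heads(attn, image_slice=slice(0, 32))
   print(out.shape)  # torch.Size([8, 64, 64])
   ```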
3. arXiv:2411.08516 [pdf, other] (cs.CL)
   Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale Table Understanding
   Authors: Deyi Ji, Lanyun Zhu, Siqi Gao, Peng Xu, Hongtao Lu, Jieping Ye, Feng Zhao
   Abstract: The ubiquity and value of tables as semi-structured data across various domains necessitate advanced methods for understanding their complexity and vast amounts of information. Despite the impressive capabilities of large language models (LLMs) in advancing the natural language understanding frontier, their application to large-scale tabular data presents significant challenges, specifically regarding table size and complex, intricate relationships. Existing works have shown promise with small-scale tables but often flounder when tasked with the complex reasoning required by larger, interconnected tables found in real-world scenarios. To address this gap, we introduce "Tree-of-Table", a novel approach designed to enhance LLMs' reasoning capabilities over large and complex tables. Our method employs Table Condensation and Decomposition to distill and reorganize relevant data into a manageable format, followed by the construction of a hierarchical Table-Tree that facilitates tree-structured reasoning. Through a meticulous Table-Tree Execution process, we systematically unravel the tree-structured reasoning chain to derive the solutions. Experiments across diverse datasets, including WikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new benchmark with superior performance, showcasing remarkable efficiency and generalization capabilities in large-scale table reasoning.
   Submitted 13 November, 2024; originally announced November 2024.
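   The condense-decompose-execute flow reads naturally as bottom-up recursion over a table tree. The toy sketch below stubs the LLM call and invents the node fields purely for illustration:

   ```python
   # Toy rendering of the Tree-of-Table flow: decompose a question over a big
   # table into a tree of sub-tables, then resolve the tree bottom-up.
   from dataclasses import dataclass, field
   from typing import List

   def llm(prompt: str) -> str:            # stand-in for a real LLM call
       return f"<answer to: {prompt[:40]}...>"

   @dataclass
   class TableTreeNode:
       question: str
       subtable: str                        # condensed slice of the big table
       children: List["TableTreeNode"] = field(default_factory=list)

   def execute(node: TableTreeNode) -> str:
       # Resolve children first, then answer this node with their results.
       partials = [execute(c) for c in node.children]
       prompt = (f"{node.question}\nTable:\n{node.subtable}\n"
                 "Partials:\n" + "\n".join(partials))
       return llm(prompt)

   root = TableTreeNode("Total 2023 revenue?", "region,rev\n...",
                        children=[TableTreeNode("EU revenue?", "EU rows..."),
                                  TableTreeNode("US revenue?", "US rows...")])
   print(execute(root))
   ```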
4. arXiv:2411.07574 [pdf, other] (cs.LG, cs.AI)
   Disentangling Tabular Data towards Better One-Class Anomaly Detection
   Authors: Jianan Ye, Zhaorui Tan, Yijie Hu, Xi Yang, Guangliang Cheng, Kaizhu Huang
   Abstract: Tabular anomaly detection under the one-class classification setting poses a significant challenge, as it involves accurately conceptualizing "normal" derived exclusively from a single category to discern anomalies from normal data variations. Capturing the intrinsic correlation among attributes within normal samples presents one promising method for learning the concept. To do so, the most recent effort relies on a learnable mask strategy with a reconstruction task. However, this approach risks producing uniform masks, i.e., essentially nothing is masked, leading to less effective correlation learning. To address this issue, we presume that attributes related to others in normal samples can be divided into two non-overlapping and correlated subsets, defined as CorrSets, to capture the intrinsic correlation effectively. Accordingly, we introduce an innovative method that disentangles CorrSets from normal tabular data. To our knowledge, this is a pioneering effort to apply the concept of disentanglement to one-class anomaly detection on tabular data. Extensive experiments on 20 tabular datasets show that our method substantially outperforms state-of-the-art methods, with an average performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC.
   Submitted 12 November, 2024; originally announced November 2024.
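   The CorrSets intuition, two disjoint, mutually predictive attribute subsets of normal data, can be illustrated with a deliberately simple stand-in: fit one subset to predict the other and use cross-reconstruction error as the anomaly score. The linear model and fixed split are assumptions, not the paper's method:

   ```python
   # Illustrative stand-in for the CorrSets idea: poor cross-prediction between
   # two correlated attribute subsets flags an anomaly.
   import numpy as np
   from sklearn.linear_model import LinearRegression

   rng = np.random.default_rng(0)
   X = rng.normal(size=(500, 8))
   # Make attributes 4..7 depend on attributes 0..3 in "normal" data.
   X[:, 4:] = X[:, :4] @ rng.normal(size=(4, 4)) + 0.1 * rng.normal(size=(500, 4))

   set_a, set_b = X[:, :4], X[:, 4:]          # two candidate CorrSets
   model = LinearRegression().fit(set_a, set_b)

   def anomaly_score(x: np.ndarray) -> float:
       # Reconstruction error of subset B from subset A.
       pred = model.predict(x[:4].reshape(1, -1))
       return float(np.square(pred - x[4:]).mean())

   print(anomaly_score(X[0]))                  # small: consistent with "normal"
   print(anomaly_score(rng.normal(size=8)))    # larger: correlation broken
   ```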
5. arXiv:2411.05619 [pdf, other] (cs.LG)
   WHALE: Towards Generalizable and Scalable World Models for Embodied Decision-making
   Authors: Zhilong Zhang, Ruifeng Chen, Junyin Ye, Yihao Sun, Pengyuan Wang, Jingcheng Pang, Kaiyuan Li, Tianshuo Liu, Haoxin Lin, Yang Yu, Zhi-Hua Zhou
   Abstract: World models play a crucial role in decision-making within embodied environments, enabling cost-free explorations that would otherwise be expensive in the real world. To facilitate effective decision-making, world models must be equipped with strong generalizability to support faithful imagination in out-of-distribution (OOD) regions and provide reliable uncertainty estimation to assess the credibility of the simulated experiences, both of which present significant challenges for prior scalable approaches. This paper introduces WHALE, a framework for learning generalizable world models, consisting of two key techniques: behavior-conditioning and retracing-rollout. Behavior-conditioning addresses the policy distribution shift, one of the primary sources of world-model generalization error, while retracing-rollout enables efficient uncertainty estimation without the need for model ensembles. These techniques are universal and can be combined with any neural network architecture for world-model learning. Incorporating these two techniques, we present Whale-ST, a scalable spatial-temporal transformer-based world model with enhanced generalizability. We demonstrate the superiority of Whale-ST in simulation tasks by evaluating both value estimation accuracy and video generation fidelity. Additionally, we examine the effectiveness of our uncertainty estimation technique, which enhances model-based policy optimization in fully offline scenarios. Furthermore, we propose Whale-X, a 414M-parameter world model trained on 970K trajectories from Open X-Embodiment datasets. We show that Whale-X exhibits promising scalability and strong generalizability in real-world manipulation scenarios using minimal demonstrations.
   Submitted 8 November, 2024; originally announced November 2024.
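   Behavior-conditioning, as described, amounts to feeding an embedding of the data-collecting policy's behavior into the transition model. The tiny interface below is a schematic reading of that idea; all dimensions and the MLP architecture are invented:

   ```python
   # Schematic behavior-conditioned world model: the next-observation
   # prediction is conditioned on a behavior embedding, which lets the model
   # account for policy distribution shift (per the abstract's description).
   import torch
   import torch.nn as nn

   class BehaviorConditionedWorldModel(nn.Module):
       def __init__(self, obs_dim=16, act_dim=4, behavior_dim=8, hidden=64):
           super().__init__()
           self.net = nn.Sequential(
               nn.Linear(obs_dim + act_dim + behavior_dim, hidden),
               nn.ReLU(), nn.Linear(hidden, obs_dim))

       def forward(self, obs, act, behavior_emb):
           return self.net(torch.cat([obs, act, behavior_emb], dim=-1))

   wm = BehaviorConditionedWorldModel()
   next_obs = wm(torch.randn(2, 16), torch.randn(2, 4), torch.randn(2, 8))
   print(next_obs.shape)  # torch.Size([2, 16])
   ```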
6. arXiv:2411.04669 [pdf, other] (cs.LG, cs.AI)
   EffiCANet: Efficient Time Series Forecasting with Convolutional Attention
   Authors: Xinxing Zhou, Jiaqi Ye, Shubao Zhao, Ming Jin, Chengyi Yang, Yanlong Wen, Xiaojie Yuan
   Abstract: The exponential growth of multivariate time series data from sensor networks in domains like industrial monitoring and smart cities requires efficient and accurate forecasting models. Current deep learning methods often fail to adequately capture long-range dependencies and complex inter-variable relationships, especially under real-time processing constraints. These limitations arise because many models are optimized for either short-term forecasting with limited receptive fields or long-term accuracy at the cost of efficiency. Additionally, dynamic and intricate interactions between variables in real-world data further complicate modeling efforts. To address these limitations, we propose EffiCANet, an Efficient Convolutional Attention Network designed to enhance forecasting accuracy while maintaining computational efficiency. EffiCANet integrates three key components: (1) a Temporal Large-kernel Decomposed Convolution (TLDC) module that captures long-term temporal dependencies while reducing computational overhead; (2) an Inter-Variable Group Convolution (IVGC) module that captures complex and evolving relationships among variables; and (3) a Global Temporal-Variable Attention (GTVA) mechanism that prioritizes critical temporal and inter-variable features. Extensive evaluations across nine benchmark datasets show that EffiCANet achieves up to a 10.02% reduction in MAE over state-of-the-art models, while cutting computational costs by 26.2% relative to conventional large-kernel convolution methods, thanks to its efficient decomposition strategy.
   Submitted 7 November, 2024; originally announced November 2024.
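   A plausible shape for a "large-kernel decomposed" temporal convolution like the TLDC module is a small depthwise convolution composed with a dilated depthwise convolution plus pointwise mixing, which buys a large effective receptive field cheaply. Kernel sizes and dilation below are assumptions, not the paper's configuration:

   ```python
   # Sketch of a large-kernel decomposed 1-D convolution: local depthwise conv
   # + dilated depthwise conv + pointwise channel mixing.
   import torch
   import torch.nn as nn

   class DecomposedLargeKernelConv1d(nn.Module):
       def __init__(self, channels, small_k=5, dilated_k=7, dilation=3):
           super().__init__()
           self.local = nn.Conv1d(channels, channels, small_k,
                                  padding=small_k // 2, groups=channels)
           self.dilated = nn.Conv1d(channels, channels, dilated_k, dilation=dilation,
                                    padding=dilation * (dilated_k // 2), groups=channels)
           self.mix = nn.Conv1d(channels, channels, 1)   # pointwise channel mixing

       def forward(self, x):                              # x: (batch, channels, time)
           return self.mix(self.dilated(self.local(x)))

   y = DecomposedLargeKernelConv1d(32)(torch.randn(4, 32, 96))
   print(y.shape)  # torch.Size([4, 32, 96]); effective receptive field ~23 steps
   ```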
7. arXiv:2411.03706 [pdf, other] (cs.CV, cs.RO)
   3DGS-CD: 3D Gaussian Splatting-based Change Detection for Physical Object Rearrangement
   Authors: Ziqi Lu, Jianbo Ye, John Leonard
   Abstract: We present 3DGS-CD, the first 3D Gaussian Splatting (3DGS)-based method for detecting physical object rearrangements in 3D scenes. Our approach estimates 3D object-level changes by comparing two sets of unaligned images taken at different times. Leveraging 3DGS's novel view rendering and EfficientSAM's zero-shot segmentation capabilities, we detect 2D object-level changes, which are then associated and fused across views to estimate 3D changes. Our method can detect changes in cluttered environments using sparse post-change images in as little as 18 seconds, using as few as a single new image. It does not rely on depth input, user instructions, object classes, or object models; an object is recognized simply if it has been rearranged. Our approach is evaluated on both public and self-collected real-world datasets, achieving up to 14% higher accuracy and three orders of magnitude faster performance compared to the state-of-the-art radiance-field-based change detection method. This significant performance boost enables a broad range of downstream applications, where we highlight three key use cases: object reconstruction, robot workspace reset, and 3DGS model update. Our code and data will be made available at https://github.com/520xyxyzq/3DGS-CD.
   Submitted 6 November, 2024; originally announced November 2024.
8. arXiv:2411.03670 [pdf, other] (cs.CV, cs.AI)
   Touchstone Benchmark: Are We on the Right Way for Evaluating AI Algorithms for Medical Segmentation?
   Authors: Pedro R. A. S. Bassi, Wenxuan Li, Yucheng Tang, Fabian Isensee, Zifu Wang, Jieneng Chen, Yu-Cheng Chou, Yannick Kirchhoff, Maximilian Rokuss, Ziyan Huang, Jin Ye, Junjun He, Tassilo Wald, Constantin Ulrich, Michael Baumgartner, Saikat Roy, Klaus H. Maier-Hein, Paul Jaeger, Yiwen Ye, Yutong Xie, Jianpeng Zhang, Ziyang Chen, Yong Xia, Zhaohu Xing, Lei Zhu, et al. (28 additional authors not shown)
   Abstract: How can we test AI performance? This question seems trivial, but it isn't. Standard benchmarks often have problems such as in-distribution and small-size test sets, oversimplified metrics, unfair comparisons, and short-term outcome pressure. As a consequence, good performance on standard benchmarks does not guarantee success in real-world scenarios. To address these problems, we present Touchstone, a large-scale collaborative segmentation benchmark of 9 types of abdominal organs. This benchmark is based on 5,195 training CT scans from 76 hospitals around the world and 5,903 testing CT scans from 11 additional hospitals. This diverse test set enhances the statistical significance of benchmark results and rigorously evaluates AI algorithms across various out-of-distribution scenarios. We invited 14 inventors of 19 AI algorithms to train their algorithms, while our team, as a third party, independently evaluated these algorithms on three test sets. In addition, we also evaluated pre-existing AI frameworks, which, unlike individual algorithms, are more flexible and can support different algorithms, including MONAI from NVIDIA, nnU-Net from DKFZ, and numerous other open-source frameworks. We are committed to expanding this benchmark to encourage more innovation of AI algorithms for the medical domain.
   Submitted 6 November, 2024; originally announced November 2024.
   Comments: Accepted to NeurIPS-2024
9. arXiv:2411.02461 [pdf, other] (cs.CL, cs.AI)
   Enhancing Multiple Dimensions of Trustworthiness in LLMs via Sparse Activation Control
   Authors: Yuxin Xiao, Chaoqun Wan, Yonggang Zhang, Wenxiao Wang, Binbin Lin, Xiaofei He, Xu Shen, Jieping Ye
   Abstract: As the development and application of Large Language Models (LLMs) continue to advance rapidly, enhancing their trustworthiness and aligning them with human preferences has become a critical area of research. Traditional methods rely heavily on extensive data for Reinforcement Learning from Human Feedback (RLHF), but representation engineering offers a new, training-free approach. This technique leverages semantic features to control the representation of an LLM's intermediate hidden states, enabling the model to meet specific requirements such as increased honesty or heightened safety awareness. However, a significant challenge arises when attempting to fulfill multiple requirements simultaneously: it proves difficult to encode various semantic contents, such as honesty and safety, into a single semantic feature, which restricts practicality. In this work, we address this issue through "Sparse Activation Control". By delving into the intrinsic mechanisms of LLMs, we identify and pinpoint components that are closely related to specific tasks within the model, i.e., attention heads. These heads display sparse characteristics that allow for near-independent control over different tasks. Our experiments, conducted on the open-source Llama series models, yield encouraging results: the models align with human preferences on issues of safety, factuality, and bias concurrently.
   Submitted 4 November, 2024; originally announced November 2024.
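   Since the paper locates near-independent, task-specific attention heads, the control step can be pictured as shifting only those heads' outputs along per-task directions. The head indices, directions, and the additive steering rule below are illustrative assumptions:

   ```python
   # Toy sparse activation control: each requirement gets its own small set
   # of heads and its own steering direction, applied independently.
   import torch

   def control_heads(head_outputs: torch.Tensor,
                     head_ids: list,
                     direction: torch.Tensor,
                     alpha: float = 2.0) -> torch.Tensor:
       """head_outputs: (heads, seq, dim); shift only the selected heads."""
       out = head_outputs.clone()
       out[head_ids] = out[head_ids] + alpha * direction   # broadcast over seq
       return out

   h = torch.randn(32, 16, 64)
   honesty_dir = torch.randn(64) / 8        # e.g. a mean-activation difference
   safety_dir = torch.randn(64) / 8
   h = control_heads(h, [3, 17], honesty_dir)  # disjoint head sets give
   h = control_heads(h, [5, 21], safety_dir)   # near-independent control
   print(h.shape)  # torch.Size([32, 16, 64])
   ```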
10. arXiv:2411.02120 [pdf, other] (cs.LG, cs.AI, q-bio.BM)
    Bridge-IF: Learning Inverse Protein Folding with Markov Bridges
    Authors: Yiheng Zhu, Jialu Wu, Qiuyi Li, Jiahuan Yan, Mingze Yin, Wei Wu, Mingyang Li, Jieping Ye, Zheng Wang, Jian Wu
    Abstract: Inverse protein folding is a fundamental task in computational protein design, which aims to design protein sequences that fold into the desired backbone structures. While the development of machine learning algorithms for this task has seen significant success, the prevailing approaches, which predominantly employ a discriminative formulation, frequently encounter the error accumulation issue and often fail to capture the extensive variety of plausible sequences. To fill these gaps, we propose Bridge-IF, a generative diffusion bridge model for inverse folding, which is designed to learn the probabilistic dependency between the distributions of backbone structures and protein sequences. Specifically, we harness an expressive structure encoder to propose a discrete, informative prior derived from structures, and establish a Markov bridge to connect this prior with native sequences. During the inference stage, Bridge-IF progressively refines the prior sequence, culminating in a more plausible design. Moreover, we introduce a reparameterization perspective on Markov bridge models, from which we derive a simplified loss function that facilitates more effective training. We also modulate protein language models (PLMs) with structural conditions to precisely approximate the Markov bridge process, thereby significantly enhancing generation performance while maintaining parameter-efficient training. Extensive experiments on well-established benchmarks demonstrate that Bridge-IF predominantly surpasses existing baselines in sequence recovery and excels in the design of plausible proteins with high foldability. The code is available at https://github.com/violet-sto/Bridge-IF.
    Submitted 4 November, 2024; originally announced November 2024.
    Comments: NeurIPS 2024
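    The inference procedure ("progressively refines the prior sequence") suggests a simple loop from a structure-derived prior toward a final design. The sketch below stubs the bridge transition with random resampling purely to show the control flow, not the actual model:

    ```python
    # Control-flow sketch of iterative refinement from a prior sequence.
    import random

    AMINO = "ACDEFGHIKLMNPQRSTVWY"

    def bridge_step(seq: str, structure, t: int) -> str:
        # Stand-in for one structure-conditioned Markov-bridge transition;
        # here: randomly resample a few positions.
        s = list(seq)
        for i in random.sample(range(len(s)), k=2):
            s[i] = random.choice(AMINO)
        return "".join(s)

    def design_sequence(structure, prior_seq: str, steps: int = 10) -> str:
        seq = prior_seq                  # informative prior from the encoder
        for t in range(steps):
            seq = bridge_step(seq, structure, t)
        return seq

    print(design_sequence(structure=None, prior_seq="A" * 20))
    ```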

arXiv:2411.01852 (https://arxiv.org/abs/2411.01852) [pdf, other]
Subjects: cs.SI (Social and Information Networks)
Title: Auditing Political Exposure Bias: Algorithmic Amplification on Twitter/X Approaching the 2024 U.S. Presidential Election
Authors: Jinyi Ye, Luca Luceri, Emilio Ferrara
Abstract: Approximately 50% of tweets in X's user timelines are personalized recommendations from accounts they do not follow. This raises a critical question: what political content are users exposed to beyond their established networks, and how might this influence democratic discourse online? Due to the black-box nature and constant evolution of social media algorithms, much remains unknown about this aspect of users' content exposure, particularly as it pertains to potential biases in algorithmic curation. Prior research has shown that certain political groups and media sources are amplified within users' in-network tweets. However, the extent to which this amplification affects out-of-network recommendations remains unclear.
As the 2024 U.S. election approaches, addressing this question is essential for understanding the influence of algorithms on online political content consumption and its potential impact on users' perspectives. In this paper, we conduct a three-week audit of X's algorithmic content recommendations using a set of 120 sock-puppet monitoring accounts that capture tweets in their personalized "For You" timelines. Our objective is to quantify out-of-network content exposure for right- and left-leaning user profiles and to assess any potential biases in political exposure. Our findings indicate that X's algorithm skews exposure toward a few high-popularity accounts across all users, with right-leaning users experiencing the highest level of exposure inequality. Both left- and right-leaning users encounter amplified exposure to accounts aligned with their own political views and reduced exposure to opposing viewpoints. Additionally, we observe a right-leaning bias in exposure for new accounts within their default timelines.
Submitted 11 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
Comments: HUMANS Lab -- Working Paper No. 2024.9 -- The 2024 Election Integrity Initiative -- University of Southern California
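
The abstract does not name its inequality measure, but exposure inequality of this kind is commonly summarized with a Gini coefficient over per-account exposure counts; a minimal sketch:

def gini(counts):
    """Gini coefficient of non-negative exposure counts: 0 means exposure is
    spread evenly across accounts; values near 1 mean a few accounts dominate."""
    xs = sorted(counts)
    n, total = len(xs), sum(xs)
    if n == 0 or total == 0:
        return 0.0
    weighted = sum((i + 1) * x for i, x in enumerate(xs))  # rank-weighted sum
    return (2 * weighted) / (n * total) - (n + 1) / n

# e.g. timeline impressions dominated by one account:
print(gini([0, 0, 10]))   # ~0.67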

arXiv:2411.01791 (https://arxiv.org/abs/2411.01791) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.LG (Machine Learning)
Title: Minder: Faulty Machine Detection for Large-scale Distributed Model Training
Authors: Yangtao Deng, Xiang Shi, Zhuo Jiang, Xingjian Zhang, Lei Zhang, Zhang Zhang, Bo Li, Zuquan Song, Hang Zhu, Gaohong Liu, Fuliang Li, Shuguang Wang, Haibin Lin, Jianxi Ye, Minlan Yu
Abstract: Large-scale distributed model training requires simultaneous training on up to thousands of machines. Faulty machine detection is critical when an unexpected fault occurs on a machine. From our experience, a training task can encounter two faults per day on average, possibly leading to a halt for hours. To address the drawbacks of time-consuming and labor-intensive manual scrutiny, we propose Minder, an automatic faulty machine detector for distributed training tasks.
The key idea of Minder is to automatically and efficiently detect the distinctive monitoring-metric patterns of faulty machines, which can persist for a period before the entire training task comes to a halt. Minder has been deployed in our production environment for over one year, monitoring daily distributed training tasks in which each task involves up to thousands of machines. In our real-world fault detection scenarios, Minder reacts to faults within 3.6 seconds on average, with a precision of 0.904 and an F1-score of 0.893.
Submitted 3 November, 2024; originally announced November 2024.
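
Minder's detector is learned from production metrics, but the core intuition (a faulty machine's metrics diverge from its peers before the job halts) can be illustrated with a simple cross-machine z-score rule; a sketch, not Minder's algorithm:

import statistics

def flag_outliers(metric_by_machine, threshold=3.0):
    """Flag machines whose current metric value sits more than `threshold`
    standard deviations from the fleet average (illustrative only)."""
    values = list(metric_by_machine.values())
    mean, stdev = statistics.fmean(values), statistics.pstdev(values)
    if stdev == 0:
        return []
    return [m for m, v in metric_by_machine.items()
            if abs(v - mean) / stdev > threshold]

# e.g. GPU utilization snapshot; small fleets need a looser threshold:
print(flag_outliers({"h1": 0.97, "h2": 0.96, "h3": 0.98, "h4": 0.11}, threshold=1.5))  # ['h4']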

arXiv:2410.23875 (https://arxiv.org/abs/2410.23875) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: Plan-on-Graph: Self-Correcting Adaptive Planning of Large Language Model on Knowledge Graphs
Authors: Liyi Chen, Panrong Tong, Zhongming Jin, Ying Sun, Jieping Ye, Hui Xiong
Abstract: Large Language Models (LLMs) have shown remarkable reasoning capabilities on complex tasks, but they still suffer from out-of-date knowledge, hallucinations, and opaque decision-making. In contrast, Knowledge Graphs (KGs) can provide explicit and editable knowledge for LLMs to alleviate these issues. The existing paradigm of KG-augmented LLMs manually predefines the breadth of the exploration space and requires flawless navigation in KGs. However, this paradigm cannot adaptively explore reasoning paths in KGs based on the question semantics or self-correct erroneous reasoning paths, resulting in a bottleneck in both efficiency and effectiveness. To address these limitations, we propose a novel self-correcting adaptive planning paradigm for KG-augmented LLMs named Plan-on-Graph (PoG), which first decomposes the question into several sub-objectives and then repeats the process of adaptively exploring reasoning paths, updating memory, and reflecting on the need to self-correct erroneous reasoning paths until arriving at the answer. Specifically, three important mechanisms, Guidance, Memory, and Reflection, are designed to work together to guarantee the adaptive breadth of self-correcting planning for graph reasoning. Finally, extensive experiments on three real-world datasets demonstrate the effectiveness and efficiency of PoG.
Submitted 31 October, 2024; originally announced October 2024.
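
In outline, the explore/memorize/reflect loop reads like the following sketch, where `llm` and `kg_neighbors` are hypothetical stand-ins for the prompted LLM and the KG interface (not the authors' implementation):

def plan_on_graph(question, seed_entities, llm, kg_neighbors, max_steps=10):
    """PoG-style adaptive planning sketch: explore KG paths, keep a memory,
    and backtrack when reflection says the current paths are wrong."""
    sub_goals = llm("decompose", question)            # Guidance: split into sub-objectives
    memory, frontier = [], list(seed_entities)
    for _ in range(max_steps):
        candidates = [(e, r, e2) for e in frontier for r, e2 in kg_neighbors(e)]
        chosen = llm("select", question, sub_goals, candidates, memory)
        memory.extend(chosen)                          # Memory: record explored triples
        verdict = llm("reflect", question, memory)     # Reflection: answer / continue / backtrack
        if verdict["action"] == "answer":
            return verdict["answer"]
        if verdict["action"] == "backtrack":           # self-correction of erroneous paths
            frontier = verdict["restart_entities"]
        else:
            frontier = [tail for _, _, tail in chosen]
    return None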

arXiv:2410.23166 (https://arxiv.org/abs/2410.23166) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.IR (Information Retrieval); cs.LG (Machine Learning)
Title: SciPIP: An LLM-based Scientific Paper Idea Proposer
Authors: Wenxiao Wang, Lihui Gu, Liye Zhang, Yunxiang Luo, Yi Dai, Chen Shen, Liang Xie, Binbin Lin, Xiaofei He, Jieping Ye
Abstract: The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulty in exploring novel ideas. Advances in large language models (LLMs), such as GPT-4, have shown great potential for enhancing idea proposals, but how to effectively utilize large models for reasonable idea proposal has not been thoroughly explored. This paper proposes a scientific paper idea proposer (SciPIP). Based on a user-provided research background, SciPIP retrieves helpful papers from a literature database while leveraging the capabilities of LLMs to generate more novel and feasible ideas. To this end, (1) we construct a literature retrieval database, extracting multi-dimensional information from a large number of papers for fast access, and propose a literature retrieval method based on semantic, entity, and citation co-occurrences to search relevant literature from multiple aspects of the user-provided background; (2) after literature retrieval, we introduce dual-path idea-proposal strategies, where one path infers solutions from the retrieved literature and the other generates original ideas through model brainstorming, and the two are combined to balance feasibility and originality. Through extensive experiments in the natural language processing (NLP) field, we demonstrate that SciPIP can retrieve citations similar to those of existing top-conference papers and generate many ideas consistent with them. We also evaluate the originality of other ideas generated by SciPIP using large language models, further validating the effectiveness of the proposed method. The code and database are released at https://github.com/cheerss/SciPIP.
Submitted 30 October, 2024; originally announced October 2024.
Comments: 25 pages, 5 figures, 19 tables
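
The multi-aspect retrieval can be pictured as a weighted blend of the three signals the abstract names; in this sketch the scoring callables and weights are placeholders, not SciPIP's code:

def rank_papers(background, papers, sem_sim, entity_overlap, cocite, w=(0.6, 0.25, 0.15)):
    """Rank candidates by semantic similarity, shared entities, and citation
    co-occurrence with the research background (illustrative weights)."""
    def score(p):
        return (w[0] * sem_sim(background, p)
                + w[1] * entity_overlap(background, p)
                + w[2] * cocite(background, p))
    return sorted(papers, key=score, reverse=True)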
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 5 figures, 19 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21316">arXiv:2410.21316</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21316">pdf</a>, <a href="https://arxiv.org/format/2410.21316">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3652892.3700781">10.1145/3652892.3700781 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Optimizer States: Towards Scalable Training of Transformer Models Using Interleaved Offloading </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maurya%2C+A">Avinash Maurya</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Rafique%2C+M+M">M. Mustafa Rafique</a>, <a href="/search/cs?searchtype=author&amp;query=Cappello%2C+F">Franck Cappello</a>, <a href="/search/cs?searchtype=author&amp;query=Nicolae%2C+B">Bogdan Nicolae</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21316v1-abstract-short" style="display: inline;"> Transformers and large language models~(LLMs) have seen rapid adoption in all domains. Their sizes have exploded to hundreds of billions of parameters and keep increasing. Under these circumstances, the training of transformers is very expensive and often hits a ``memory wall&#39;&#39;, i.e., even when using 3D parallelism (pipeline, tensor, data) and aggregating the memory of many GPUs, it is still not e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21316v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21316v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21316v1-abstract-full" style="display: none;"> Transformers and large language models~(LLMs) have seen rapid adoption in all domains. Their sizes have exploded to hundreds of billions of parameters and keep increasing. Under these circumstances, the training of transformers is very expensive and often hits a ``memory wall&#39;&#39;, i.e., even when using 3D parallelism (pipeline, tensor, data) and aggregating the memory of many GPUs, it is still not enough to hold the necessary data structures (model parameters, optimizer state, gradients, activations) in GPU memory. 
To compensate, state-of-the-art approaches offload the optimizer state, at least partially, to host memory and perform hybrid CPU-GPU computations. However, management of the combined host-GPU memory is often suboptimal and results in poor overlap between data movement and computation, leading to missed opportunities to simultaneously leverage the interconnect bandwidth and the computational capabilities of CPUs and GPUs. In this paper, we leverage a key observation: the interleaving of the forward, backward, and update phases generates fluctuations in GPU memory utilization, which can be exploited to dynamically move part of the optimizer state between host and GPU memory at each iteration. To this end, we design and implement Deep Optimizer States, a novel technique that splits the LLM into subgroups whose update phase is scheduled on either the CPU or the GPU, based on our proposed performance model that addresses the trade-off between data-movement cost, acceleration on GPUs versus CPUs, and competition for shared resources. We integrate our approach with DeepSpeed and demonstrate 2.5x faster iterations over state-of-the-art approaches in extensive experiments.
Submitted 25 October, 2024; originally announced October 2024.
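
The per-subgroup placement decision reduces to comparing modeled update times; a deliberately simplified cost model (the paper's actual model also accounts for overlap and resource contention):

def place_update(bytes_state, pcie_gbps, cpu_gbps, gpu_gbps, free_gpu_bytes):
    """Choose where one subgroup's optimizer update runs: on the CPU next to
    the offloaded state, or on the GPU after paying the PCIe transfer."""
    cpu_time = bytes_state / (cpu_gbps * 1e9)
    gpu_time = bytes_state / (pcie_gbps * 1e9) + bytes_state / (gpu_gbps * 1e9)
    if bytes_state > free_gpu_bytes:          # no room in this iteration's memory trough
        return "cpu"
    return "gpu" if gpu_time < cpu_time else "cpu"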

arXiv:2410.21259 (https://arxiv.org/abs/2410.21259) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: AutoBench-V: Can Large Vision-Language Models Benchmark Themselves?
Authors: Han Bao, Yue Huang, Yanbo Wang, Jiayi Ye, Xiangqi Wang, Xiuying Chen, Mohamed Elhoseiny, Xiangliang Zhang
Abstract: Large Vision-Language Models (LVLMs) have become essential for advancing the integration of visual and linguistic information, facilitating a wide range of complex applications and tasks. However, evaluating LVLMs presents significant challenges: evaluation benchmarks demand substantial human cost to construct and remain static, lacking flexibility once built. Although automatic evaluation has been explored for the textual modality, the visual modality remains under-explored. In this work we therefore ask: "Can LVLMs serve as a path to automatic benchmarking?" We introduce AutoBench-V, an automated framework for serving evaluation on demand, i.e., benchmarking LVLMs based on specific aspects of model capability. Upon receiving an evaluation capability, AutoBench-V leverages text-to-image models to generate relevant image samples and then utilizes LVLMs to orchestrate visual question-answering (VQA) tasks, completing the evaluation process efficiently and flexibly. Through an extensive evaluation of seven popular LVLMs across five user-demanded inputs (i.e., evaluation capabilities), the framework shows effectiveness and reliability. We observe the following: (1) our constructed benchmarks accurately reflect varying task difficulties; (2) as task difficulty rises, the performance gap between models widens; (3) while models exhibit strong performance in abstract-level understanding, they underperform in detail-reasoning tasks; and (4) constructing a dataset with varying levels of difficulty is critical for comprehensive and exhaustive evaluation. Overall, AutoBench-V not only successfully utilizes LVLMs for automated benchmarking but also reveals that LVLMs as judges have significant potential across various domains.
Submitted 29 October, 2024; v1 submitted 28 October, 2024; originally announced October 2024.
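
Schematically, the on-demand pipeline chains three models; in this sketch `write_specs`, `t2i`, `ask_vqa`, and `judge` are hypothetical callables standing in for the prompt writer, the text-to-image model, the examined LVLM, and the judging model:

def autobench_v(capability, write_specs, t2i, ask_vqa, judge, n=20):
    """Generate capability-targeted images, pose VQA questions about them,
    and score the examined model's answers (illustrative sketch)."""
    verdicts = []
    for spec in write_specs(capability, n):       # image prompt, question, reference answer
        image = t2i(spec["image_prompt"])
        answer = ask_vqa(image, spec["question"])
        verdicts.append(judge(spec["reference"], answer))
    return sum(verdicts) / len(verdicts)          # fraction judged correct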
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18921">arXiv:2410.18921</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18921">pdf</a>, <a href="https://arxiv.org/format/2410.18921">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> From Blind Solvers to Logical Thinkers: Benchmarking LLMs&#39; Logical Integrity on Faulty Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rahman%2C+A+M+M">A M Muntasir Rahman</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junyi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wenpeng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guiling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18921v1-abstract-short" style="display: inline;"> Consider the math problem: &#34;Lily received 3 cookies from her best friend yesterday and ate 5 for breakfast. Today, her friend gave her 3 more cookies. How many cookies does Lily have now?&#34; Many large language models (LLMs) in previous research approach this problem by calculating the answer &#34;1&#34; using the equation &#34;3 - 5 + 3.&#34; However, from a human perspective, we recognize the inherent flaw in thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18921v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18921v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18921v1-abstract-full" style="display: none;"> Consider the math problem: &#34;Lily received 3 cookies from her best friend yesterday and ate 5 for breakfast. Today, her friend gave her 3 more cookies. How many cookies does Lily have now?&#34; Many large language models (LLMs) in previous research approach this problem by calculating the answer &#34;1&#34; using the equation &#34;3 - 5 + 3.&#34; However, from a human perspective, we recognize the inherent flaw in this problem: Lily cannot eat 5 cookies if she initially only had 3. This discrepancy prompts a key question: Are current LLMs merely Blind Solver that apply mathematical operations without deeper reasoning, or can they function as Logical Thinker capable of identifying logical inconsistencies? To explore this question, we propose a benchmark dataset, FaultyMath, which includes faulty math problems of rich diversity: i) multiple mathematical categories, e.g., algebra, geometry, number theory, etc., ii) varying levels of difficulty, and iii) different origins of faultiness -- ranging from violations of common sense and ambiguous statements to mathematical contradictions and more. 
We evaluate a broad spectrum of LLMs, including open-source, closed-source, and math-specialized models, using FaultyMath across three dimensions: (i) how accurately can the models detect faulty math problems without being explicitly prompted to do so? (ii) when provided with hints, either correct or misleading, about the validity of the problems, to what extent do LLMs adapt to become reliable Logical Thinkers? (iii) how trustworthy are the explanations generated by LLMs when they recognize a math problem as flawed? Through extensive experimentation and detailed analysis, our results demonstrate that existing LLMs largely function as Blind Solvers and fall short of the reasoning capabilities required to perform as Logical Thinkers.
Submitted 24 October, 2024; originally announced October 2024.
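
The cookie problem illustrates the kind of infeasibility that a simple state-tracking check exposes: an intermediate count goes negative. A minimal illustration (not the benchmark's evaluation code):

def feasible_inventory(deltas):
    """Return False if any running count would go negative, e.g. eating
    5 cookies while holding only 3."""
    count = 0
    for d in deltas:
        count += d
        if count < 0:
            return False
    return True

# The faulty problem from the abstract: +3 received, -5 eaten, +3 received.
print(feasible_inventory([3, -5, 3]))   # False: the problem is ill-posed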

arXiv:2410.18808 (https://arxiv.org/abs/2410.18808) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Delving into the Reversal Curse: How Far Can Large Language Models Generalize?
Authors: Zhengkai Lin, Zhihang Fu, Kai Liu, Liang Xie, Binbin Lin, Wenxiao Wang, Deng Cai, Yue Wu, Jieping Ye
Abstract: While large language models (LLMs) showcase unprecedented capabilities, they also exhibit certain inherent limitations when facing seemingly trivial tasks. A prime example is the recently debated "reversal curse", which surfaces when models, having been trained on the fact "A is B", struggle to generalize this knowledge to infer that "B is A". In this paper, we examine the manifestation of the reversal curse across various tasks and delve into both the generalization abilities and the problem-solving mechanisms of LLMs. This investigation yields a series of significant insights: (1) LLMs are able to generalize to "B is A" when both A and B are presented in the context, as in the case of a multiple-choice question. (2) This generalization ability is highly correlated with the structure of the fact "A is B" in the training documents. For example, the generalization applies only to biographies structured as "[Name] is [Description]", not to "[Description] is [Name]". (3) We propose and verify the hypothesis that LLMs possess an inherent bias in fact recall during knowledge application, which explains and underscores the importance of document structure for successful learning. (4) The negative impact of this bias on the downstream performance of LLMs can hardly be mitigated through training alone. Based on these intriguing findings, our work not only presents a novel perspective for interpreting LLMs' generalization abilities in terms of their intrinsic working mechanisms but also provides new insights for developing more effective learning methods for LLMs.
Submitted 24 October, 2024; originally announced October 2024.
Comments: Accepted at NeurIPS 2024
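
The effect is easy to probe mechanically: expose a model to facts in one direction and query the reverse. A sketch of probe construction with a made-up fact (illustrative, not the paper's pipeline):

def make_reversal_probes(facts):
    """From (name, description) pairs, build forward statements in the
    '[Name] is [Description]' structure and reversed completions to test."""
    train, probes = [], []
    for name, desc in facts:
        train.append(f"{name} is {desc}.")       # direction seen in training
        probes.append((f"{desc} is", name))      # reversed direction to be inferred
    return train, probes

# Hypothetical fact, for illustration only:
train, probes = make_reversal_probes([("Alice Zhou", "the composer of the Meridian Suite")])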
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18336">arXiv:2410.18336</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18336">pdf</a>, <a href="https://arxiv.org/format/2410.18336">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Assessing the Creativity of LLMs in Proposing Novel Solutions to Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junyi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jingyi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xinyun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wenpeng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guiling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18336v1-abstract-short" style="display: inline;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18336v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18336v1-abstract-full" style="display: none;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creative potential of Large Language Models (LLMs) in mathematical reasoning, an aspect that has received limited attention in prior research. We introduce a novel framework and benchmark, CreativeMath, which encompasses problems ranging from middle school curricula to Olympic-level competitions, designed to assess LLMs&#39; ability to propose innovative solutions after some known solutions have been provided. Our experiments demonstrate that, while LLMs perform well on standard mathematical tasks, their capacity for creative problem-solving varies considerably. Notably, the Gemini-1.5-Pro model outperformed other LLMs in generating novel solutions. 
This research opens a new frontier in evaluating AI creativity, shedding light on both the strengths and limitations of LLMs in fostering mathematical innovation, and setting the stage for future developments in AI-assisted mathematical discovery.
Submitted 23 October, 2024; originally announced October 2024.

arXiv:2410.17891 (https://arxiv.org/abs/2410.17891) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Scaling Diffusion Language Models via Adaptation from Autoregressive Models
Authors: Shansan Gong, Shivam Agarwal, Yizhe Zhang, Jiacheng Ye, Lin Zheng, Mukai Li, Chenxin An, Peilin Zhao, Wei Bi, Jiawei Han, Hao Peng, Lingpeng Kong
Abstract: Diffusion Language Models (DLMs) have emerged as a promising new paradigm for text generative modeling, potentially addressing limitations of autoregressive (AR) models.
However, current DLMs have been studied at a smaller scale than their AR counterparts and lack fair comparison on language modeling benchmarks. Additionally, training diffusion models from scratch at scale remains challenging. Given the prevalence of open-source AR language models, we propose adapting these models to build text diffusion models. We demonstrate connections between AR and diffusion modeling objectives and introduce a simple continual pre-training approach for training diffusion models. Through systematic evaluation on language modeling, reasoning, and commonsense benchmarks, we show that we can convert AR models ranging from 127M to 7B parameters (GPT2 and LLaMA) into the diffusion models DiffuGPT and DiffuLLaMA, using less than 200B tokens for training. Our experimental results reveal that these models outperform earlier DLMs and are competitive with their AR counterparts. We release a suite of DLMs (with 127M, 355M, and 7B parameters) capable of generating fluent text, performing in-context learning, filling in the middle without prompt re-ordering, and following instructions: https://github.com/HKUNLP/DiffuLLaMA.
Submitted 23 October, 2024; originally announced October 2024.
Comments: 25 pages. Code: https://github.com/HKUNLP/DiffuLLaMA
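
A common way to realize such an adaptation, and a reasonable mental model for the continual pre-training objective here, is masked discrete diffusion: corrupt a random fraction of tokens and train the (now bidirectionally attending) model to restore them. A simplified sketch, not the released training code:

import torch
import torch.nn.functional as F

def masked_diffusion_step(model, tokens, mask_id):
    """One continual-pretraining step: mask a random fraction of tokens and
    score reconstruction only at the masked positions (illustrative)."""
    b, n = tokens.shape
    t = torch.rand(b, 1)                         # per-sequence corruption level in (0, 1)
    corrupt = torch.rand(b, n) < t               # heavier masking at larger t
    noisy = torch.where(corrupt, torch.full_like(tokens, mask_id), tokens)
    logits = model(noisy)                        # (b, n, vocab); full attention assumed
    return F.cross_entropy(logits[corrupt], tokens[corrupt])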

arXiv:2410.15042 (https://arxiv.org/abs/2410.15042) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Adversarial Training: A Survey
Authors: Mengnan Zhao, Lihe Zhang, Jingwen Ye, Huchuan Lu, Baocai Yin, Xinchao Wang
Abstract: Adversarial training (AT) refers to integrating adversarial examples (inputs altered with imperceptible perturbations that can significantly impact model predictions) into the training process. Recent studies have demonstrated the effectiveness of AT in improving the robustness of deep neural networks against diverse adversarial attacks. However, a comprehensive overview of these developments is still missing. This survey addresses this gap by reviewing a broad range of recent and representative studies. Specifically, we first describe the implementation procedures and practical applications of AT, followed by a comprehensive review of AT techniques from three perspectives: data enhancement, network design, and training configurations. Lastly, we discuss common challenges in AT and propose several promising directions for future research.
Submitted 19 October, 2024; originally announced October 2024.
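
The canonical instance of the procedure being surveyed is min-max training with a PGD inner loop (Madry et al.); a compact PyTorch sketch:

import torch
import torch.nn.functional as F

def pgd_adversarial_loss(model, x, y, eps=8/255, alpha=2/255, steps=10):
    """Inner maximization: find an L-inf perturbation within eps by projected
    gradient ascent; the returned loss is then minimized w.r.t. the model."""
    delta = torch.empty_like(x).uniform_(-eps, eps).requires_grad_(True)
    for _ in range(steps):
        loss = F.cross_entropy(model(x + delta), y)
        grad, = torch.autograd.grad(loss, delta)
        delta = (delta + alpha * grad.sign()).clamp(-eps, eps)
        delta = delta.detach().requires_grad_(True)
    return F.cross_entropy(model(x + delta.detach()), y)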

arXiv:2410.14283 (https://arxiv.org/abs/2410.14283) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Takin-ADA: Emotion Controllable Audio-Driven Animation with Canonical and Landmark Loss Optimization
Authors: Bin Lin, Yanzhen Yu, Jianhao Ye, Ruitao Lv, Yuguang Yang, Ruoye Xie, Pan Yu, Hongbin Zhou
Abstract: Existing audio-driven facial animation methods face critical challenges, including expression leakage, ineffective transfer of subtle expressions, and imprecise audio-driven synchronization. We find that these issues stem from limitations in motion representation and a lack of fine-grained control over facial expressions. To address these problems, we present Takin-ADA, a novel two-stage approach for real-time audio-driven portrait animation. In the first stage, we introduce a specialized loss function that enhances subtle expression transfer while reducing unwanted expression leakage. The second stage uses an advanced audio processing technique to improve lip-sync accuracy. Our method not only generates precise lip movements but also allows flexible control over facial expressions and head motions. Takin-ADA achieves high-resolution (512x512) facial animation at up to 42 FPS on an RTX 4090 GPU, outperforming existing commercial solutions. Extensive experiments demonstrate that our model significantly surpasses previous methods in video quality, facial dynamics realism, and natural head movements, setting a new benchmark in audio-driven facial animation.
Submitted 18 October, 2024; originally announced October 2024.
Comments: under review
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14157">arXiv:2410.14157</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14157">pdf</a>, <a href="https://arxiv.org/format/2410.14157">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Autoregression: Discrete Diffusion for Complex Reasoning and Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiacheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jiahui Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+S">Shansan Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Lin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xin Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+L">Lingpeng Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14157v1-abstract-short" style="display: inline;"> Autoregressive language models, despite their impressive capabilities, struggle with complex reasoning and long-term planning tasks. We introduce discrete diffusion models as a novel solution to these challenges. Through the lens of subgoal imbalance, we demonstrate how diffusion models effectively learn difficult subgoals that elude autoregressive approaches. We propose Multi-granularity Diffusio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14157v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14157v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14157v1-abstract-full" style="display: none;"> Autoregressive language models, despite their impressive capabilities, struggle with complex reasoning and long-term planning tasks. We introduce discrete diffusion models as a novel solution to these challenges. Through the lens of subgoal imbalance, we demonstrate how diffusion models effectively learn difficult subgoals that elude autoregressive approaches. We propose Multi-granularity Diffusion Modeling (MDM), which prioritizes subgoals based on difficulty during learning. On complex tasks like Countdown, Sudoku, and Boolean Satisfiability Problems, MDM significantly outperforms autoregressive models without using search techniques. For instance, MDM achieves 91.5\% and 100\% accuracy on Countdown and Sudoku, respectively, compared to 45.8\% and 20.7\% for autoregressive models. Our work highlights the potential of diffusion-based approaches in advancing AI capabilities for sophisticated language understanding and problem-solving tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14157v1-abstract-full').style.display = 'none'; document.getElementById('2410.14157v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13830">arXiv:2410.13830</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13830">pdf</a>, <a href="https://arxiv.org/format/2410.13830">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DreamVideo-2: Zero-Shot Subject-Driven Video Customization with Precise Motion Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yujie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shiwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+H">Hangjie Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+H">Haonan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yutong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhizhong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiaxin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingya Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+H">Hongming Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13830v1-abstract-short" style="display: inline;"> Recent advances in customized video generation have enabled users to create videos tailored to both specific subjects and motion trajectories. However, existing methods often require complicated test-time fine-tuning and struggle with balancing subject learning and motion control, limiting their real-world applications. In this paper, we present DreamVideo-2, a zero-shot video customization framew&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13830v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13830v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13830v1-abstract-full" style="display: none;"> Recent advances in customized video generation have enabled users to create videos tailored to both specific subjects and motion trajectories. However, existing methods often require complicated test-time fine-tuning and struggle with balancing subject learning and motion control, limiting their real-world applications. 
In this paper, we present DreamVideo-2, a zero-shot video customization framework capable of generating videos with a specific subject and motion trajectory, guided by a single image and a bounding box sequence, respectively, and without the need for test-time fine-tuning. Specifically, we introduce reference attention, which leverages the model&#39;s inherent capabilities for subject learning, and devise a mask-guided motion module to achieve precise motion control by fully utilizing the robust motion signal of box masks derived from bounding boxes. While these two components achieve their intended functions, we empirically observe that motion control tends to dominate over subject learning. To address this, we propose two key designs: 1) the masked reference attention, which integrates a blended latent mask modeling scheme into reference attention to enhance subject representations at the desired positions, and 2) a reweighted diffusion loss, which differentiates the contributions of regions inside and outside the bounding boxes to ensure a balance between subject and motion control. Extensive experimental results on a newly curated dataset demonstrate that DreamVideo-2 outperforms state-of-the-art methods in both subject customization and motion control. The dataset, code, and models will be made publicly available. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://dreamvideo2.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11761">arXiv:2410.11761</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11761">pdf</a>, <a href="https://arxiv.org/format/2410.11761">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SlideChat: A Large Vision-Language Assistant for Whole-Slide Pathology Image Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yanjun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+N">Nana Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Rongshan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11761v2-abstract-short" style="display: inline;"> Despite the progress made by multimodal large language models (MLLMs) in computational pathology, they remain limited by a predominant focus on patch-level analysis, missing essential contextual information at the whole-slide level. The lack of large-scale instruction datasets and the gigapixel scale of whole slide images (WSIs) pose significant developmental challenges. In this paper, we present&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11761v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11761v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11761v2-abstract-full" style="display: none;"> Despite the progress made by multimodal large language models (MLLMs) in computational pathology, they remain limited by a predominant focus on patch-level analysis, missing essential contextual information at the whole-slide level. The lack of large-scale instruction datasets and the gigapixel scale of whole slide images (WSIs) pose significant developmental challenges. In this paper, we present SlideChat, the first vision-language assistant capable of understanding gigapixel whole-slide images, exhibiting excellent multimodal conversational capability and response complex instruction across diverse pathology scenarios. 
To support its development, we created SlideInstruction, the largest instruction-following dataset for WSIs, consisting of 4.2K WSI captions and 176K VQA pairs with multiple categories. Furthermore, we propose SlideBench, a multimodal benchmark that incorporates captioning and VQA tasks to assess SlideChat&#39;s capabilities in varied clinical settings such as microscopy and diagnosis. Compared to both general and specialized MLLMs, SlideChat exhibits exceptional capabilities, achieving state-of-the-art performance on 18 of 22 tasks. For example, it achieved an overall accuracy of 81.17% on SlideBench-VQA (TCGA), and 54.15% on SlideBench-VQA (BCNB). We will fully release SlideChat, SlideInstruction, and SlideBench as open-source resources to facilitate research and development in computational pathology. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09732">arXiv:2410.09732</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09732">pdf</a>, <a href="https://arxiv.org/format/2410.09732">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LOKI: A Comprehensive Synthetic Data Detection Benchmark using Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junyan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Baichuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zilong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+T">Tianyi Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H">Hengrui Kang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jun He</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Honglin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhizheng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiping Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weijia Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">
With the rapid development of AI-generated content, the future internet may be inundated with synthetic data, making the discrimination of authentic and credible multimodal data increasingly challenging. Synthetic data detection has thus garnered widespread attention, and the performance of large multimodal models (LMMs) in this task has attracted significant interest. LMMs can provide natural language explanations for their authenticity judgments, enhancing the explainability of synthetic content detection. Simultaneously, the task of distinguishing between real and synthetic data effectively tests the perception, knowledge, and reasoning capabilities of LMMs. In response, we introduce LOKI, a novel benchmark designed to evaluate the ability of LMMs to detect synthetic data across multiple modalities. LOKI encompasses video, image, 3D, text, and audio modalities, comprising 18K carefully curated questions across 26 subcategories with clear difficulty levels. The benchmark includes coarse-grained judgment and multiple-choice questions, as well as fine-grained anomaly selection and explanation tasks, allowing for a comprehensive analysis of LMMs. We evaluated 22 open-source LMMs and 6 closed-source models on LOKI, highlighting their potential as synthetic data detectors and also revealing some limitations in the development of LMM capabilities. More information about LOKI can be found at https://opendatalab.github.io/LOKI/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">79 pages, 63 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09321">arXiv:2410.09321</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09321">pdf</a>, <a href="https://arxiv.org/ps/2410.09321">ps</a>, <a href="https://arxiv.org/format/2410.09321">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Simultaneously Approximating All Norms for Massively Parallel Correlation Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+N">Nairen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jia Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09321v2-abstract-short" style="display: inline;"> We revisit the simultaneous approximation model for the correlation clustering problem introduced by Davies, Moseley, and Newman[DMN24]. The objective is to find a clustering that minimizes given norms of the disagreement vector over all vertices. We present an efficient algorithm that produces a clustering that is simultaneously a $63.3$-approximation for all monotone symmetric norms. This sign&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09321v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09321v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09321v2-abstract-full" style="display: none;"> We revisit the simultaneous approximation model for the correlation clustering problem introduced by Davies, Moseley, and Newman[DMN24]. The objective is to find a clustering that minimizes given norms of the disagreement vector over all vertices. We present an efficient algorithm that produces a clustering that is simultaneously a $63.3$-approximation for all monotone symmetric norms. This significantly improves upon the previous approximation ratio of $6348$ due to Davies, Moseley, and Newman[DMN24], which works only for $\ell_p$-norms. To achieve this result, we first reduce the problem to approximating all top-$k$ norms simultaneously, using the connection between monotone symmetric norms and top-$k$ norms established by Chakrabarty and Swamy [CS19]. Then we develop a novel procedure that constructs a $12.66$-approximate fractional clustering for all top-$k$ norms. Our $63.3$-approximation ratio is obtained by combining this with the $5$-approximate rounding algorithm by Kalhan, Makarychev, and Zhou[KMZ19]. We then demonstrate that with a loss of $蔚$ in the approximation ratio, the algorithm can be adapted to run in nearly linear time and in the MPC (massively parallel computation) model with poly-logarithmic number of rounds. By allowing a further trade-off in the approximation ratio to $(359+蔚)$, the number of MPC rounds can be reduced to a constant. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09321v2-abstract-full').style.display = 'none'; document.getElementById('2410.09321v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07838">arXiv:2410.07838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07838">pdf</a>, <a href="https://arxiv.org/format/2410.07838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MinorityPrompt: Text to Minority Image Generation via Prompt Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Um%2C+S">Soobin Um</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07838v1-abstract-short" style="display: inline;"> We investigate the generation of minority samples using pretrained text-to-image (T2I) latent diffusion models. Minority instances, in the context of T2I generation, can be defined as ones living on low-density regions of text-conditional data distributions. They are valuable for various applications of modern T2I generators, such as data augmentation and creative AI. Unfortunately, existing pretr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07838v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07838v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07838v1-abstract-full" style="display: none;"> We investigate the generation of minority samples using pretrained text-to-image (T2I) latent diffusion models. Minority instances, in the context of T2I generation, can be defined as ones living on low-density regions of text-conditional data distributions. They are valuable for various applications of modern T2I generators, such as data augmentation and creative AI. Unfortunately, existing pretrained T2I diffusion models primarily focus on high-density regions, largely due to the influence of guided samplers (like CFG) that are essential for producing high-quality generations. To address this, we present a novel framework to counter the high-density-focus of T2I diffusion models. Specifically, we first develop an online prompt optimization framework that can encourage the emergence of desired properties during inference while preserving semantic contents of user-provided prompts. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07838">arXiv:2410.07838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07838">pdf</a>, <a href="https://arxiv.org/format/2410.07838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MinorityPrompt: Text to Minority Image Generation via Prompt Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Um%2C+S">Soobin Um</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We investigate the generation of minority samples using pretrained text-to-image (T2I) latent diffusion models. Minority instances, in the context of T2I generation, can be defined as ones living on low-density regions of text-conditional data distributions. They are valuable for various applications of modern T2I generators, such as data augmentation and creative AI. Unfortunately, existing pretrained T2I diffusion models primarily focus on high-density regions, largely due to the influence of guided samplers (like CFG) that are essential for producing high-quality generations. To address this, we present a novel framework to counter the high-density-focus of T2I diffusion models. Specifically, we first develop an online prompt optimization framework that can encourage the emergence of desired properties during inference while preserving semantic contents of user-provided prompts. We subsequently tailor this generic prompt optimizer into a specialized solver that promotes the generation of minority features by incorporating a carefully crafted likelihood objective. Our comprehensive experiments, conducted across various types of T2I models, demonstrate that our approach significantly enhances the capability to produce high-quality minority instances compared to existing samplers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07815">arXiv:2410.07815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07815">pdf</a>, <a href="https://arxiv.org/format/2410.07815">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Simple ReFlow: Improved Techniques for Fast Flow Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+B">Beomsu Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+Y">Yu-Guan Hsieh</a>, <a href="/search/cs?searchtype=author&amp;query=Klein%2C+M">Michal Klein</a>, <a href="/search/cs?searchtype=author&amp;query=Cuturi%2C+M">Marco Cuturi</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Kawar%2C+B">Bahjat Kawar</a>, <a href="/search/cs?searchtype=author&amp;query=Thornton%2C+J">James Thornton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Diffusion and flow-matching models achieve remarkable generative performance, but at the cost of many sampling steps; this slows inference and limits applicability to time-critical tasks.
The ReFlow procedure can accelerate sampling by straightening generation trajectories. However, ReFlow is an iterative procedure, typically requiring training on simulated data, and results in reduced sample quality. To mitigate sample deterioration, we examine the design space of ReFlow and highlight potential pitfalls in prior heuristic practices. We then propose seven improvements for training dynamics, learning and inference, which are verified with thorough ablation studies on CIFAR10 $32 \times 32$, AFHQv2 $64 \times 64$, and FFHQ $64 \times 64$. Combining all our techniques, we achieve state-of-the-art FID scores (without / with guidance, resp.) for fast generation via neural ODEs: $2.23$ / $1.98$ on CIFAR10, $2.30$ / $1.91$ on AFHQv2, $2.84$ / $2.67$ on FFHQ, and $3.49$ / $1.74$ on ImageNet-64, all with merely $9$ neural function evaluations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
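<p class="is-size-7">For context on the entry above: the baseline ReFlow procedure retrains a velocity field on couplings simulated from the current model, pulling sampling trajectories toward straight lines. Below is a minimal PyTorch sketch of one such iteration on toy data; it shows only the standard rectified-flow objective that the paper refines, and every name in it is illustrative rather than taken from the paper.</p> <pre><code>import torch
import torch.nn as nn

# Toy velocity field v_theta(x, t); ReFlow itself is architecture-agnostic.
class VelocityField(nn.Module):
    def __init__(self, dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 64), nn.SiLU(), nn.Linear(64, dim))

    def forward(self, x, t):
        return self.net(torch.cat([x, t], dim=-1))

@torch.no_grad()
def simulate_pairs(model, n, dim=2, steps=50):
    """Couple noise with the model's own samples by integrating its ODE (Euler)."""
    x0 = torch.randn(n, dim)
    x = x0.clone()
    for i in range(steps):
        t = torch.full((n, 1), i / steps)
        x = x + model(x, t) / steps
    return x0, x  # (noise, generated sample): the simulated data ReFlow trains on

def reflow_iteration(model, opt, n_batches=100, batch=256):
    """One ReFlow pass: fit straight-line velocities on self-simulated couplings."""
    for _ in range(n_batches):
        x0, x1 = simulate_pairs(model, batch)
        t = torch.rand(batch, 1)
        xt = (1 - t) * x0 + t * x1                       # straight-line interpolant
        loss = ((model(xt, t) - (x1 - x0)) ** 2).mean()  # match constant velocity
        opt.zero_grad(); loss.backward(); opt.step()

model = VelocityField()
reflow_iteration(model, torch.optim.Adam(model.parameters(), lr=1e-3))</code></pre>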
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07579">arXiv:2410.07579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07579">pdf</a>, <a href="https://arxiv.org/format/2410.07579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Teddy: Efficient Large-Scale Dataset Distillation via Taylor-Approximated Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruonan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Songhua Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jingwen Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Dataset distillation or condensation refers to compressing a large-scale dataset into a much smaller one, enabling models trained on this synthetic dataset to generalize effectively on real data. Tackling this challenge, as defined, relies on a bi-level optimization algorithm: a novel model is trained in each iteration within a nested loop, with gradients propagated through an unrolled computation graph. However, this approach incurs high memory and time complexity, posing difficulties in scaling up to large datasets such as ImageNet. Addressing these concerns, this paper introduces Teddy, a Taylor-approximated dataset distillation framework designed to handle large-scale datasets and enhance efficiency. On the one hand, backed by theoretical analysis, we propose a memory-efficient approximation derived from Taylor expansion, which transforms the original form, dependent on multi-step gradients, into a first-order one. On the other hand, rather than repeatedly training a novel model in each iteration, we unveil that employing a pre-cached pool of weak models, which can be generated from a single base model, enhances both time efficiency and performance concurrently, particularly when dealing with large-scale datasets. Extensive experiments demonstrate that the proposed Teddy attains state-of-the-art efficiency and performance on the Tiny-ImageNet and original-sized ImageNet-1K datasets, notably surpassing prior methods by up to 12.8%, while reducing runtime by 46.6%. Our code will be available at https://github.com/Lexie-YU/Teddy. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV2024</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06055">arXiv:2410.06055</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06055">pdf</a>, <a href="https://arxiv.org/format/2410.06055">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AP-LDM: Attentive and Progressive Latent Diffusion Model for Training-Free High-Resolution Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+B">Boyuan Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiaxin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yujie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+H">Hongming Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Latent diffusion models (LDMs), such as Stable Diffusion, often experience significant structural distortions when directly generating high-resolution (HR) images that exceed their original training resolutions. A straightforward and cost-effective solution is to adapt pre-trained LDMs for HR image generation; however, existing methods often suffer from poor image quality and long inference time. In this paper, we propose an Attentive and Progressive LDM (AP-LDM), a novel, training-free framework aimed at enhancing HR image quality while accelerating the generation process. AP-LDM decomposes the denoising process of LDMs into two stages: (i) attentive training-resolution denoising, and (ii) progressive high-resolution denoising. The first stage generates a latent representation of a higher-quality training-resolution image through the proposed attentive guidance, which utilizes a novel parameter-free self-attention mechanism to enhance the structural consistency. The second stage progressively performs upsampling in pixel space, alleviating the severe artifacts caused by latent space upsampling. Leveraging the effective initialization from the first stage enables denoising at higher resolutions with significantly fewer steps, enhancing overall efficiency. Extensive experimental results demonstrate that AP-LDM significantly outperforms state-of-the-art methods, delivering up to a 5x speedup in HR image generation, thereby highlighting its substantial advantages for real-world applications. Code is available at https://github.com/kmittle/AP-LDM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
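<p class="is-size-7">The two-stage decomposition in the AP-LDM abstract above, shown as a structural sketch (see the linked repository for the actual implementation; every function below is an illustrative stand-in, not the released API):</p> <pre><code>import numpy as np

# Illustrative stand-ins marking where the real components would run.
def denoise_attentive(prompt, size):   # stage (i): training-resolution denoising
    return np.zeros(size)              #   with parameter-free attentive guidance
def upsample_pixels(image, size):      # pixel-space (not latent-space) upsampling
    return np.zeros(size)
def refine(image, prompt, steps):      # a few denoising steps at the new resolution
    return image

def ap_ldm(prompt, train_res=(512, 512)):
    image = denoise_attentive(prompt, train_res)  # stage (i)
    for size in [(1024, 1024), (2048, 2048)]:     # stage (ii): progressive upsampling
        image = refine(upsample_pixels(image, size), prompt, steps=10)
    return image

print(ap_ldm("a photo of a cat").shape)</code></pre>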
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05651">arXiv:2410.05651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05651">pdf</a>, <a href="https://arxiv.org/format/2410.05651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ViBiDSampler: Enhancing Video Interpolation Using Bidirectional Diffusion Sampler </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Serin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+T">Taesung Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent progress in large-scale text-to-video (T2V) and image-to-video (I2V) diffusion models has greatly enhanced video generation, especially in terms of keyframe interpolation. However, current image-to-video diffusion models, while powerful in generating videos from a single conditioning frame, need adaptation for two-frame (start &amp; end) conditioned generation, which is essential for effective bounded interpolation. Unfortunately, existing approaches that fuse temporally forward and backward paths in parallel often suffer from off-manifold issues, leading to artifacts or requiring multiple iterative re-noising steps. In this work, we introduce a novel, bidirectional sampling strategy to address these off-manifold issues without requiring extensive re-noising or fine-tuning. Our method employs sequential sampling along both forward and backward paths, conditioned on the start and end frames, respectively, ensuring more coherent and on-manifold generation of intermediate frames. Additionally, we incorporate advanced guidance techniques, CFG++ and DDS, to further enhance the interpolation process. By integrating these, our method achieves state-of-the-art performance, efficiently generating high-quality, smooth videos between keyframes. On a single 3090 GPU, our method can interpolate 25 frames at 1024 x 576 resolution in just 195 seconds, establishing it as a leading solution for keyframe interpolation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://vibid.github.io/</span> </p> </li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05651v1-abstract-full').style.display = 'none'; document.getElementById('2410.05651v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://vibid.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05591">arXiv:2410.05591</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05591">pdf</a>, <a href="https://arxiv.org/format/2410.05591">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TweedieMix: Improving Multi-Concept Fusion for Diffusion-based Image/Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+G">Gihyun Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05591v1-abstract-short" style="display: inline;"> Despite significant advancements in customizing text-to-image and video generation models, generating images and videos that effectively integrate multiple personalized concepts remains a challenging task. To address this, we present TweedieMix, a novel method for composing customized diffusion models during the inference phase. By analyzing the properties of reverse diffusion sampling, our approa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05591v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05591v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05591v1-abstract-full" style="display: none;"> Despite significant advancements in customizing text-to-image and video generation models, generating images and videos that effectively integrate multiple personalized concepts remains a challenging task. To address this, we present TweedieMix, a novel method for composing customized diffusion models during the inference phase. By analyzing the properties of reverse diffusion sampling, our approach divides the sampling process into two stages. During the initial steps, we apply a multiple object-aware sampling technique to ensure the inclusion of the desired target objects. In the later steps, we blend the appearances of the custom concepts in the de-noised image space using Tweedie&#39;s formula. Our results demonstrate that TweedieMix can generate multiple personalized concepts with higher fidelity than existing methods. Moreover, our framework can be effortlessly extended to image-to-video diffusion models, enabling the generation of videos that feature multiple personalized concepts. Results and source code are in our anonymous project page. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05591v1-abstract-full').style.display = 'none'; document.getElementById('2410.05591v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Github Page: https://github.com/KwonGihyun/TweedieMix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05346">arXiv:2410.05346</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05346">pdf</a>, <a href="https://arxiv.org/format/2410.05346">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AnyAttack: Towards Large-scale Self-supervised Generation of Targeted Adversarial Examples for Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junhong Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yunfan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&amp;query=Yeung%2C+D">Dit-Yan Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05346v1-abstract-short" style="display: inline;"> Due to their multimodal capabilities, Vision-Language Models (VLMs) have found numerous impactful applications in real-world scenarios. However, recent studies have revealed that VLMs are vulnerable to image-based adversarial attacks, particularly targeted adversarial images that manipulate the model to generate harmful content specified by the adversary. Current attack methods rely on predefined&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05346v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05346v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05346v1-abstract-full" style="display: none;"> Due to their multimodal capabilities, Vision-Language Models (VLMs) have found numerous impactful applications in real-world scenarios. However, recent studies have revealed that VLMs are vulnerable to image-based adversarial attacks, particularly targeted adversarial images that manipulate the model to generate harmful content specified by the adversary. Current attack methods rely on predefined target labels to create targeted adversarial attacks, which limits their scalability and applicability for large-scale robustness evaluations. 
In this paper, we propose AnyAttack, a self-supervised framework that generates targeted adversarial images for VLMs without label supervision, allowing any image to serve as a target for the attack. To address the limitation of existing methods that require label supervision, we introduce a contrastive loss that trains a generator on the large-scale unlabeled LAION-400M dataset to generate targeted adversarial noise. This large-scale pre-training endows our method with powerful transferability across a wide range of VLMs. Extensive experiments on five mainstream open-source VLMs (CLIP, BLIP, BLIP2, InstructBLIP, and MiniGPT-4) across three multimodal tasks (image-text retrieval, multimodal classification, and image captioning) demonstrate the effectiveness of our attack. Additionally, we successfully transfer AnyAttack to multiple commercial VLMs, including Google&#39;s Gemini, Claude&#39;s Sonnet, and Microsoft&#39;s Copilot. These results reveal an unprecedented risk to VLMs, highlighting the need for effective countermeasures. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
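<p class="is-size-7">The abstract above specifies a contrastive loss that trains a noise generator without labels so that any image can serve as the attack target. The sketch below is a hedged reconstruction of one plausible form of that objective, using toy networks and in-batch InfoNCE pairing; the paper&#39;s exact generator, encoder, and loss may differ.</p> <pre><code>import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy stand-ins: a real attack would use a frozen pretrained vision encoder.
enc = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128))
gen = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 3 * 32 * 32), nn.Tanh())

def anyattack_step(src, tgt, opt, eps=8 / 255, tau=0.1):
    """One self-supervised step: noise generated FROM the target image should
    steer the source image's embedding toward that target's embedding."""
    delta = eps * gen(tgt).view_as(src)            # targeted noise, L_inf-bounded
    z_adv = F.normalize(enc(src + delta), dim=-1)  # embedding of attacked image
    z_tgt = F.normalize(enc(tgt), dim=-1)          # embedding of target image
    logits = z_adv @ z_tgt.T / tau                 # in-batch contrastive logits
    loss = F.cross_entropy(logits, torch.arange(len(src)))  # match i-th target
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

opt = torch.optim.Adam(gen.parameters(), lr=1e-4)
src, tgt = torch.rand(16, 3, 32, 32), torch.rand(16, 3, 32, 32)
anyattack_step(src, tgt, opt)</code></pre>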
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04721">arXiv:2410.04721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04721">pdf</a>, <a href="https://arxiv.org/format/2410.04721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ACDC: Autoregressive Coherent Multimodal Generation using Diffusion Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H">Hyungjin Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+D">Dohun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Autoregressive models (ARMs) and diffusion models (DMs) represent two leading paradigms in generative modeling, each excelling in distinct areas: ARMs in global context modeling and long-sequence generation, and DMs in generating high-quality local contexts, especially for continuous data such as images and short videos. However, ARMs often suffer from exponential error accumulation over long sequences, leading to physically implausible results, while DMs are limited by their local context generation capabilities. In this work, we introduce Autoregressive Coherent multimodal generation with Diffusion Correction (ACDC), a zero-shot approach that combines the strengths of both ARMs and DMs at the inference stage without the need for additional fine-tuning. ACDC leverages ARMs for global context generation and memory-conditioned DMs for local correction, ensuring high-quality outputs by correcting artifacts in generated multimodal tokens. In particular, we propose a memory module based on large language models (LLMs) that dynamically adjusts the conditioning texts for the DMs, preserving crucial global context information. Our experiments on multimodal tasks, including coherent multi-frame story generation and autoregressive video generation, demonstrate that ACDC effectively mitigates the accumulation of errors and significantly enhances the quality of generated outputs, achieving superior performance while remaining agnostic to specific ARM and DM architectures. Project page: https://acdc2025.github.io/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 10 figures. Project page: https://acdc2025.github.io/</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04364">arXiv:2410.04364</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04364">pdf</a>, <a href="https://arxiv.org/format/2410.04364">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> VideoGuide: Improving Video Diffusion Models without Training Through a Teacher&#39;s Guide </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+D">Dohun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+B+S">Bryan S Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+G+Y">Geon Yeong Park</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J+C">Jong Chul Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Text-to-image (T2I) diffusion models have revolutionized visual content creation, but extending these capabilities to text-to-video (T2V) generation remains a challenge, particularly in preserving temporal consistency. Existing methods that aim to improve consistency often cause trade-offs such as reduced imaging quality and impractical computational time. To address these issues, we introduce VideoGuide, a novel framework that enhances the temporal consistency of pretrained T2V models without the need for additional training or fine-tuning. Instead, VideoGuide leverages any pretrained video diffusion model (VDM) or itself as a guide during the early stages of inference, improving temporal quality by interpolating the guiding model&#39;s denoised samples into the sampling model&#39;s denoising process. The proposed method brings about significant improvement in temporal consistency and image fidelity, providing a cost-effective and practical solution that synergizes the strengths of various video diffusion models. Furthermore, we demonstrate prior distillation, revealing that base models can achieve enhanced text coherence by utilizing the superior data prior of the guiding model through the proposed method.
Project Page: https://dohunlee1.github.io/videoguide.github.io/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 14 figures, Project Page: https://dohunlee1.github.io/videoguide.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03553">arXiv:2410.03553</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03553">pdf</a>, <a href="https://arxiv.org/format/2410.03553">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Structure-Enhanced Protein Instruction Tuning: Towards General-Purpose Protein Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+M">Mingze Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yiheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+K">Kun Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jieping Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Hui Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Proteins, as essential biomolecules, play a central role in biological processes, including metabolic reactions and DNA replication. Accurate prediction of their properties and functions is crucial in biological applications.
The recent development of protein language models (pLMs) with supervised fine-tuning provides a promising solution to this problem. However, a fine-tuned model is tailored to a particular downstream prediction task, and achieving general-purpose protein understanding remains a challenge. In this paper, we introduce the Structure-Enhanced Protein Instruction Tuning (SEPIT) framework to bridge this gap. Our approach integrates a novel structure-aware module into pLMs to inform them with structural knowledge, and then connects these enhanced pLMs to large language models (LLMs) to generate understanding of proteins. In this framework, we propose a novel two-stage instruction tuning pipeline that first establishes a basic understanding of proteins through caption-based instructions and then refines this understanding using a mixture of experts (MoEs) to learn more complex properties and functional information with the same amount of activated parameters. Moreover, we construct the largest and most comprehensive protein instruction dataset to date, which allows us to train and evaluate the general-purpose protein understanding model. Extensive experimental results on open-ended generation and closed-set answer tasks demonstrate the superior performance of SEPIT over both closed-source general LLMs and open-source LLMs trained with protein knowledge. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02811">arXiv:2410.02811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02811">pdf</a>, <a href="https://arxiv.org/format/2410.02811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SAC-KG: Exploiting Large Language Models as Skilled Automatic Constructors for Domain Knowledge Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hanzhu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xu Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+Q">Qitan Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+X">Xiaoqi Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Knowledge graphs (KGs) play a pivotal role in knowledge-intensive tasks across specialized domains, where the acquisition of precise and dependable knowledge is crucial. However, existing KG construction methods heavily rely on human intervention to attain qualified KGs, which severely hinders their practical applicability in real-world scenarios. To address this challenge, we propose a general KG construction framework, named SAC-KG, to exploit large language models (LLMs) as Skilled Automatic Constructors for domain Knowledge Graphs. SAC-KG effectively involves LLMs as domain experts to generate specialized and precise multi-level KGs. Specifically, SAC-KG consists of three components: Generator, Verifier, and Pruner. For a given entity, Generator produces its relations and tails from raw domain corpora, to construct a specialized single-level KG.
Verifier and Pruner then work together to ensure precision by correcting generation errors and determining whether newly produced tails require further iteration for the next-level KG.Experiments demonstrate that SAC-KG automatically constructs a domain KG at the scale of over one million nodes and achieves a precision of 89.32%, leading to a superior performance with over 20% increase in precision rate compared to existing state-of-the-art methods for the KG construction task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02811v1-abstract-full').style.display = 'none'; document.getElementById('2410.02811v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02736">arXiv:2410.02736</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02736">pdf</a>, <a href="https://arxiv.org/format/2410.02736">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiayi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yue Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dongping Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qihui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Moniz%2C+N">Nuno Moniz</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Geyer%2C+W">Werner Geyer</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P">Pin-Yu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chawla%2C+N+V">Nitesh V Chawla</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiangliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02736v2-abstract-short" style="display: inline;"> LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite their excellence in many domains, potential issues are under-explored, undermining their reliability and the scope of their utility. 
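The Generator-Verifier-Pruner pipeline amounts to an iterative frontier expansion over entities. A minimal sketch of that control flow follows; llm_generate, llm_verify, and should_expand are hypothetical stand-ins for the paper's LLM-backed components, not the authors' code.

    def llm_generate(entity, corpus):
        """Stub Generator: propose (relation, tail) pairs for entity from raw text."""
        return [("has_symptom", entity + "_symptom")]       # placeholder output

    def llm_verify(triple):
        """Stub Verifier: accept or reject a candidate triple."""
        return True

    def should_expand(tail):
        """Stub Pruner: decide whether tail seeds the next-level KG."""
        return len(tail) < 30

    def build_kg(root, corpus, max_depth=3):
        kg, frontier = [], [(root, 0)]
        while frontier:
            entity, depth = frontier.pop()
            if depth >= max_depth:
                continue
            for rel, tail in llm_generate(entity, corpus):
                triple = (entity, rel, tail)
                if llm_verify(triple):                      # keep only verified triples
                    kg.append(triple)
                    if should_expand(tail):                 # Pruner gates the next level
                        frontier.append((tail, depth + 1))
        return kg

    print(build_kg("rice_blast", corpus=None))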
arXiv:2410.02736 [pdf, other] cs.CL cs.AI

Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge

Authors: Jiayi Ye, Yanbo Wang, Yue Huang, Dongping Chen, Qihui Zhang, Nuno Moniz, Tian Gao, Werner Geyer, Chao Huang, Pin-Yu Chen, Nitesh V Chawla, Xiangliang Zhang

Abstract: LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and has served as a source of supervised rewards in model training. However, despite its strengths in many domains, potential issues remain under-explored, undermining its reliability and the scope of its utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework, CALM, which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the results indicate that while advanced models have achieved commendable overall performance, significant biases persist in certain specific tasks. Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge. Moreover, we discuss the explicit and implicit influence of these biases and give suggestions for the reliable application of LLM-as-a-Judge. Our work highlights the need for stakeholders to address these issues and reminds users to exercise caution in LLM-as-a-Judge applications.

Submitted 3 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
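Principle-guided modification boils down to perturbing the judging context in a controlled way and measuring whether verdicts change. A minimal sketch for one such bias, position bias, follows; the judge function is a hypothetical stub, not the CALM implementation.

    import random

    def judge(question, answer_a, answer_b):
        """Stub LLM judge returning 'A' or 'B'. Replace with a real model call."""
        return random.choice(["A", "B"])

    def position_bias_rate(pairs):
        flips = 0
        for question, a, b in pairs:
            first = judge(question, a, b)
            swapped = judge(question, b, a)        # same pair, order reversed
            # A position-consistent judge flips its label when the order flips.
            if (first == "A") != (swapped == "B"):
                flips += 1
        return flips / len(pairs)

    pairs = [("Is the sky blue?", "Yes, due to Rayleigh scattering.", "No.")]
    print(position_bias_rate(pairs))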
arXiv:2410.02330 [pdf, other] cs.CL

Llama SLayer 8B: Shallow Layers Hold the Key to Knowledge Injection

Authors: Tianxiang Chen, Zhentao Tan, Tao Gong, Yue Wu, Qi Chu, Bin Liu, Jieping Ye, Nenghai Yu

Abstract: As a means of augmenting pre-trained large language models (LLMs), knowledge injection is critical for developing vertical-domain large models and has been widely studied. Although most current approaches, including parameter-efficient fine-tuning (PEFT) and block expansion methods, uniformly apply knowledge across all LLM layers, this raises the question: are all layers equally crucial for knowledge injection? We begin by evaluating the importance of each layer to find the optimal layer range for knowledge injection. Intuitively, more important layers should play a more critical role in knowledge injection and deserve denser injection. We observe performance dips on question-answering benchmarks after the removal or expansion of the shallow layers, and the degradation shrinks as the layer gets deeper, indicating that the shallow layers hold the key to knowledge injection. This insight leads us to propose the S strategy, a post-pretraining strategy of selectively enhancing shallow layers while pruning the less effective deep ones. Based on this strategy, we introduce Llama Slayer-8B and Llama Slayer-8B-Instruct. We experimented on a corpus of code & math and demonstrated the effectiveness of our strategy. Further experiments across a different LLM, Mistral-7B, and a legal corpus confirmed the general applicability of the approach, underscoring its wide-ranging efficacy. Our code is available at: https://github.com/txchen-USTC/Llama-Slayer

Submitted 3 October, 2024; originally announced October 2024.
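The layer-importance probe described above can be approximated by ablating one block at a time and recording the benchmark dip. The sketch below illustrates this on a toy residual network; ToyLM and the scoring stub are assumptions for illustration, not the released code.

    import copy
    import torch
    import torch.nn as nn

    class ToyLM(nn.Module):
        def __init__(self, depth=6, d=32):
            super().__init__()
            self.blocks = nn.ModuleList(nn.Linear(d, d) for _ in range(depth))
        def forward(self, x):
            for b in self.blocks:
                x = x + torch.relu(b(x))      # residual blocks, as in a transformer
            return x

    def evaluate(model):
        """Stub benchmark score; in practice, QA accuracy on a held-out set."""
        x = torch.randn(4, 32)
        with torch.no_grad():
            return -model(x).pow(2).mean().item()

    model = ToyLM()
    base = evaluate(model)
    dips = []
    for i in range(len(model.blocks)):
        ablated = copy.deepcopy(model)
        del ablated.blocks[i]                  # remove one layer
        dips.append(base - evaluate(ablated))  # bigger dip = more critical layer
    print(dips)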
arXiv:2410.01677 [pdf, other] cs.AI

Mind Scramble: Unveiling Large Language Model Psychology Via Typoglycemia

Authors: Miao Yu, Junyuan Mao, Guibin Zhang, Jingheng Ye, Junfeng Fang, Aoxiao Zhong, Yang Liu, Yuxuan Liang, Kun Wang, Qingsong Wen

Abstract: Research into the external behaviors and internal mechanisms of large language models (LLMs) has shown promise in addressing complex tasks in the physical world. Studies suggest that powerful LLMs, like GPT-4, are beginning to exhibit human-like cognitive abilities, including planning, reasoning, and reflection. In this paper, we introduce a research line and methodology called LLM Psychology, leveraging human psychology experiments to investigate the cognitive behaviors and mechanisms of LLMs. We migrate the Typoglycemia phenomenon from psychology to explore the "mind" of LLMs. Unlike human brains, which rely on context and word patterns to comprehend scrambled text, LLMs use distinct encoding and decoding processes. Through Typoglycemia experiments at the character, word, and sentence levels, we observe: (I) LLMs demonstrate human-like behaviors on a macro scale, such as lower task accuracy and higher token/time consumption; (II) LLMs exhibit varying robustness to scrambled input, making Typoglycemia a benchmark for model evaluation without new datasets; (III) different task types have varying impacts, with complex logical tasks (e.g., math) being more challenging in scrambled form; (IV) each LLM has a unique and consistent "cognitive pattern" across tasks, revealing general mechanisms in its psychological processes. We provide an in-depth analysis of hidden layers to explain these phenomena, paving the way for future research in LLM Psychology and deeper interpretability.

Submitted 23 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
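Word-level Typoglycemia is straightforward to reproduce: shuffle the interior letters of each word while pinning the first and last. A minimal sketch, assuming simple whitespace tokenization (the paper also scrambles at character and sentence levels):

    import random

    def typoglycemia(text, rng=random.Random(0)):
        """Scramble interior letters of each word, keeping first/last fixed."""
        def scramble(w):
            if len(w) <= 3:
                return w
            mid = list(w[1:-1])
            rng.shuffle(mid)
            return w[0] + "".join(mid) + w[-1]
        return " ".join(scramble(w) for w in text.split())

    print(typoglycemia("large language models use distinct encoding processes"))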
arXiv:2410.01350 [pdf, other] cs.SD cs.AI eess.AS

Takin-VC: Zero-shot Voice Conversion via Jointly Hybrid Content and Memory-Augmented Context-Aware Timbre Modeling

Authors: Yuguang Yang, Yu Pan, Jixun Yao, Xiang Zhang, Jianhao Ye, Hongbin Zhou, Lei Xie, Lei Ma, Jianjun Zhao

Abstract: Zero-shot voice conversion (VC) aims to transform the source speaker's timbre into an arbitrary unseen one without altering the original speech content. While recent zero-shot VC methods have shown remarkable progress, there remains considerable room for improving speaker similarity and speech naturalness. In this paper, we propose Takin-VC, a novel zero-shot VC framework based on jointly hybrid content and memory-augmented context-aware timbre modeling to tackle this challenge. Specifically, we first present an effective hybrid content encoder, guided by neural codec training, that leverages quantized features from pre-trained WavLM and HybridFormer to extract the linguistic content of the source speech. Subsequently, we introduce an advanced cross-attention-based context-aware timbre modeling approach that learns the fine-grained, semantically associated target timbre features. To further enhance both speaker similarity and real-time performance, we utilize a conditional flow matching model to reconstruct the Mel-spectrogram of the source speech. Additionally, we advocate an efficient memory-augmented module designed to generate high-quality conditional target inputs for the flow matching process, thereby improving the overall performance of the proposed system. Experimental results demonstrate that the proposed Takin-VC method surpasses state-of-the-art zero-shot VC systems, delivering superior performance in terms of both speech naturalness and speaker similarity.

Submitted 2 October, 2024; originally announced October 2024.

Comments: Work in Progress; Under Review
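Conditional flow matching trains a network to regress the velocity of a straight path from noise to data, conditioned here on timbre/content features. The sketch below shows one such training step with an illustrative MLP; the shapes and network are assumptions, not Takin-VC's actual modules.

    import torch
    import torch.nn as nn

    mel_dim, cond_dim = 80, 16
    net = nn.Sequential(nn.Linear(mel_dim + cond_dim + 1, 256), nn.SiLU(),
                        nn.Linear(256, mel_dim))    # predicts a velocity field

    def cfm_loss(x1, cond):
        """One flow-matching step: regress the straight-path velocity x1 - x0."""
        x0 = torch.randn_like(x1)                   # noise sample
        t = torch.rand(x1.shape[0], 1)              # random time in [0, 1]
        xt = (1 - t) * x0 + t * x1                  # linear interpolation path
        v_target = x1 - x0
        v_pred = net(torch.cat([xt, cond, t], dim=-1))
        return (v_pred - v_target).pow(2).mean()

    x1 = torch.randn(4, mel_dim)       # target mel frames
    cond = torch.randn(4, cond_dim)    # timbre/content conditioning features
    print(cfm_loss(x1, cond).item())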
arXiv:2410.01105 [pdf, other] cs.RO

M2P2: A Multi-Modal Passive Perception Dataset for Off-Road Mobility in Extreme Low-Light Conditions

Authors: Aniket Datar, Anuj Pokhrel, Mohammad Nazeri, Madhan B. Rao, Chenhui Pan, Yufan Zhang, Andre Harrison, Maggie Wigness, Philip R. Osteen, Jinwei Ye, Xuesu Xiao

Abstract: Long-duration, off-road, autonomous missions require robots to continuously perceive their surroundings regardless of the ambient lighting conditions. Most existing autonomy systems heavily rely on active sensing, e.g., LiDAR, RADAR, and Time-of-Flight sensors, or use (stereo) visible light imaging sensors, e.g., color cameras, to perceive environment geometry and semantics. In scenarios where fully passive perception is required and lighting conditions are degraded to an extent that visible light cameras fail to perceive, most downstream mobility tasks such as obstacle avoidance become impossible. To address such a challenge, this paper presents a Multi-Modal Passive Perception dataset, M2P2, to enable off-road mobility in low-light to no-light conditions. We design a multi-modal sensor suite including thermal, event, and stereo RGB cameras, GPS, two Inertial Measurement Units (IMUs), as well as a high-resolution LiDAR for ground truth, with a novel multi-sensor calibration procedure that can efficiently transform multi-modal perceptual streams into a common coordinate system. Our 10-hour, 32 km dataset also includes mobility data such as robot odometry and actions, and covers well-lit, low-light, and no-light conditions, along with paved, on-trail, and off-trail terrain. Our results demonstrate that off-road mobility is possible using only passive perception in extreme low-light conditions with end-to-end learning and classical planning. The project website can be found at https://cs.gmu.edu/~xiao/Research/M2P2/

Submitted 1 October, 2024; originally announced October 2024.
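Once extrinsics are calibrated, bringing multi-modal streams into a common coordinate system reduces, per sensor, to applying a 4 x 4 rigid transform to that sensor's measurements. A minimal sketch of that transform; the mounting pose below is a made-up example, not an M2P2 calibration result.

    import numpy as np

    def to_common_frame(points, T_sensor_to_base):
        """Map N x 3 sensor-frame points into the shared base frame."""
        homo = np.hstack([points, np.ones((len(points), 1))])  # homogeneous coords
        return (T_sensor_to_base @ homo.T).T[:, :3]

    # Example: a sensor mounted 0.2 m above the base, rotated 90 deg about z.
    T = np.eye(4)
    T[:3, :3] = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
    T[2, 3] = 0.2
    print(to_common_frame(np.array([[1.0, 0.0, 0.0]]), T))     # -> [[0. 1. 0.2]]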
arXiv:2410.00132 [pdf, other] cs.CV

CVVLSNet: Vehicle Location and Speed Estimation Using Partial Connected Vehicle Trajectory Data

Authors: Jiachen Ye, Dingyu Wang, Shaocheng Jia, Xin Pei, Zi Yang, Yi Zhang, S. C. Wong

Abstract: Real-time estimation of vehicle locations and speeds is crucial for developing many beneficial transportation applications in traffic management and control, e.g., adaptive signal control. Recent advances in communication technologies facilitate the emergence of connected vehicles (CVs), which can share traffic information with nearby CVs or infrastructure. At the early stage of connectivity, only a portion of vehicles are CVs. The locations and speeds of non-CVs (NCs) are not accessible and must be estimated to obtain the full traffic information. To address this problem, this paper proposes a novel CV-based Vehicle Location and Speed estimation network, CVVLSNet, to simultaneously estimate vehicle locations and speeds exclusively from partial CV trajectory data. A road cell occupancy (RCO) method is first proposed to represent the variable vehicle state information, so that spatiotemporal interactions can be integrated by simply fusing the RCO representations. Then, CVVLSNet, taking the Coding-RAte TransformEr (CRATE) network as a backbone, is introduced to estimate the vehicle locations and speeds. Moreover, physical vehicle size constraints are incorporated into the loss functions. Extensive experiments indicate that the proposed method significantly outperforms the existing method under various CV penetration rates, signal timings, and volume-to-capacity ratios.

Submitted 30 September, 2024; originally announced October 2024.
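A road cell occupancy representation discretizes the road into fixed-size cells and marks which cells the observed CVs occupy. A minimal sketch under assumed road length and cell size; the paper's exact encoding may differ.

    import numpy as np

    def road_cell_occupancy(cv_positions, road_length=200.0, cell_size=2.0):
        """Binary occupancy over fixed-size road cells from observed CV positions."""
        n_cells = int(road_length / cell_size)
        occ = np.zeros(n_cells, dtype=np.int8)
        for x in cv_positions:
            idx = int(x // cell_size)
            if 0 <= idx < n_cells:
                occ[idx] = 1            # cell contains an observed CV
        return occ

    print(road_cell_occupancy([3.5, 12.0, 150.2]).sum())  # 3 occupied cells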
arXiv:2410.00083 [pdf, ps, other] cs.LG cs.AI cs.CV

A Survey on Diffusion Models for Inverse Problems

Authors: Giannis Daras, Hyungjin Chung, Chieh-Hsin Lai, Yuki Mitsufuji, Jong Chul Ye, Peyman Milanfar, Alexandros G. Dimakis, Mauricio Delbracio

Abstract: Diffusion models have become increasingly popular for generative modeling due to their ability to generate high-quality samples. This has unlocked exciting new possibilities for solving inverse problems, especially in image restoration and reconstruction, by treating diffusion models as unsupervised priors. This survey provides a comprehensive overview of methods that utilize pre-trained diffusion models to solve inverse problems without requiring further training. We introduce taxonomies to categorize these methods based on both the problems they address and the techniques they employ. We analyze the connections between different approaches, offering insights into their practical implementation and highlighting important considerations. We further discuss specific challenges and potential solutions associated with using latent diffusion models for inverse problems. This work aims to be a valuable resource for those interested in learning about the intersection of diffusion models and inverse problems.

Submitted 30 September, 2024; originally announced October 2024.

Comments: Work in progress. 38 pages
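One family of methods covered by such surveys guides each denoising step with a data-consistency gradient, in the spirit of diffusion posterior sampling. The sketch below is a heavily simplified caricature of one guided step, with stub prior and measurement operators; it is not any specific paper's algorithm.

    import torch

    def guided_step(x_t, t, score_model, y, A, sigma_t, step_size=1.0):
        """One guided update: denoise with the prior, then nudge toward A(x) ~= y."""
        x_t = x_t.detach().requires_grad_(True)
        eps = score_model(x_t, t)                    # predicted noise (prior)
        x0_hat = x_t - sigma_t * eps                 # rough clean estimate
        residual = (A(x0_hat) - y).pow(2).sum()      # measurement mismatch
        grad = torch.autograd.grad(residual, x_t)[0]
        return (x0_hat - step_size * grad).detach()  # data-consistency correction

    score_model = lambda x, t: torch.zeros_like(x)   # stub prior network
    A = lambda x: x[:, ::2]                          # toy measurement: subsampling
    y = torch.zeros(1, 8)
    x = torch.randn(1, 16)
    print(guided_step(x, 0, score_model, y, A, sigma_t=1.0).shape)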
arXiv:2410.00046 [pdf, other] eess.IV cs.CV cs.LG

Mixture of Multicenter Experts in Multimodal Generative AI for Advanced Radiotherapy Target Delineation

Authors: Yujin Oh, Sangjoon Park, Xiang Li, Wang Yi, Jonathan Paly, Jason Efstathiou, Annie Chan, Jun Won Kim, Hwa Kyung Byun, Ik Jae Lee, Jaeho Cho, Chan Woo Wee, Peng Shu, Peilong Wang, Nathan Yu, Jason Holmes, Jong Chul Ye, Quanzheng Li, Wei Liu, Woong Sub Koom, Jin Sung Kim, Kyungsang Kim

Abstract: Clinical experts employ diverse philosophies and strategies in patient care, influenced by regional patient populations. However, existing medical artificial intelligence (AI) models are often trained on data distributions that disproportionately reflect highly prevalent patterns, reinforcing biases and overlooking the diverse expertise of clinicians. To overcome this limitation, we introduce the Mixture of Multicenter Experts (MoME) approach. This method strategically integrates specialized expertise from diverse clinical strategies, enhancing the AI model's ability to generalize and adapt across multiple medical centers. The MoME-based multimodal target volume delineation model, trained with few-shot samples including images and clinical notes from each medical center, outperformed baseline methods in prostate cancer radiotherapy target delineation. The advantages of MoME were most pronounced when data characteristics varied across centers or when data availability was limited, demonstrating its potential for broader clinical applications. The MoME framework therefore enables the deployment of AI-based target volume delineation models in resource-constrained medical facilities by adapting to the specific preferences of each medical center using only a few data samples, without the need for data sharing between institutions. Expanding the number of multicenter experts within the MoME framework will significantly enhance generalizability, while also improving the usability and adaptability of clinical AI applications in precision radiation oncology.

Submitted 26 October, 2024; v1 submitted 27 September, 2024; originally announced October 2024.

Comments: 39 pages
arXiv:2409.20441 [pdf, other] cs.CL

Instance-adaptive Zero-shot Chain-of-Thought Prompting

Authors: Xiaosong Yuan, Chen Shen, Shaotian Yan, Xiaofeng Zhang, Liang Xie, Wenxiao Wang, Renchu Guan, Ying Wang, Jieping Ye

Abstract: Zero-shot Chain-of-Thought (CoT) prompting has emerged as a simple and effective strategy for enhancing the performance of large language models (LLMs) on real-world reasoning tasks. Nonetheless, the efficacy of a single task-level prompt applied uniformly across all instances is inherently limited, since one prompt cannot suit every instance; a more appropriate approach considers the interaction between the prompt and each instance. This work introduces an instance-adaptive prompting algorithm as an alternative zero-shot CoT reasoning scheme that adaptively differentiates good and bad prompts. Concretely, we first analyze LLMs through the lens of information flow to uncover the mechanism underlying zero-shot CoT reasoning, and discover that the information flows from question to prompt and from question to rationale jointly influence the reasoning results most. We observe that good zero-shot CoT reasoning requires the prompt to obtain semantic information from the question, and the rationale to aggregate sufficient information from the question both directly and indirectly via the prompt; lacking either typically leads to poor reasoning. Stemming from this, we further propose an instance-adaptive prompting strategy (IAP) for zero-shot CoT reasoning. Experiments conducted with LLaMA-2, LLaMA-3, and Qwen on math, logic, and commonsense reasoning tasks (e.g., GSM8K, MMLU, Causal Judgement) show consistent improvements, demonstrating that instance-adaptive zero-shot CoT prompting outperforms task-level methods built on curated prompts or sophisticated procedures, and underscoring the significance of our findings about the zero-shot CoT reasoning mechanism.

Submitted 30 October, 2024; v1 submitted 30 September, 2024; originally announced September 2024.

Comments: Accepted by NeurIPS 2024
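Instance-adaptive selection needs a per-instance score for each candidate prompt. As a crude proxy for the paper's information-flow analysis, one can score each prompt by the attention mass flowing from question tokens to prompt tokens and pick the argmax; the sketch below assumes precomputed attention matrices and is not the IAP saliency measure itself.

    import numpy as np

    def flow_score(attn, prompt_idx, question_idx):
        """Mean attention from question tokens to prompt tokens
        (a crude information-flow proxy)."""
        return attn[np.ix_(question_idx, prompt_idx)].mean()

    def pick_prompt(attn_per_prompt, prompt_idx, question_idx):
        scores = [flow_score(a, prompt_idx, question_idx) for a in attn_per_prompt]
        return int(np.argmax(scores))               # instance-adaptive choice

    rng = np.random.default_rng(0)
    attn_per_prompt = [rng.random((10, 10)) for _ in range(3)]  # one per candidate
    print(pick_prompt(attn_per_prompt, prompt_idx=[0, 1, 2], question_idx=[3, 4, 5]))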
arXiv:2409.17392 [pdf, other] cs.LG q-fin.TR

Trading through Earnings Seasons using Self-Supervised Contrastive Representation Learning

Authors: Zhengxin Joseph Ye, Bjoern Schuller

Abstract: Earnings releases are key economic events in the financial markets and crucial for predicting stock movements. Earnings data give a glimpse into how a company is doing financially and can hint at where its stock might go next. However, the irregularity of the release cycle makes it challenging to incorporate such data into medium-frequency algorithmic trading models, and its usefulness fades quickly after release, making it hard for models to stay accurate over time. Addressing this challenge, we introduce the Contrastive Earnings Transformer (CET) model, a self-supervised learning approach rooted in Contrastive Predictive Coding (CPC), aiming to optimise the utilisation of earnings data. To ascertain its effectiveness, we conduct a comparative study of CET against benchmark models across diverse sectors. Our research delves into the intricacies of stock data, evaluating how various models, and notably CET, handle the rapidly changing relevance of earnings data over time and across different sectors. The results shed light on CET's distinct advantage in extrapolating the inherent value of earnings data over time. Its foundation on CPC allows for a nuanced understanding, facilitating consistent stock predictions even as the earnings data ages. This presents a fresh approach to making better use of earnings data in algorithmic trading for predicting stock price trends.

Submitted 25 September, 2024; originally announced September 2024.
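CPC-style training reduces to an InfoNCE objective: each encoded context should score its own future representation above the other futures in the batch. A minimal sketch, with random features standing in for encoded past/future windows of market data; this illustrates the loss, not CET's encoders.

    import torch
    import torch.nn.functional as F

    def info_nce(context, future, temperature=0.1):
        """CPC-style loss: positives sit on the diagonal of the similarity matrix."""
        logits = context @ future.T / temperature   # (B, B) similarities
        labels = torch.arange(len(context))         # i-th context matches i-th future
        return F.cross_entropy(logits, labels)

    context = F.normalize(torch.randn(8, 64), dim=-1)  # encoded past windows
    future = F.normalize(torch.randn(8, 64), dim=-1)   # encoded future windows
    print(info_nce(context, future).item())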
arXiv:2409.15936 [pdf, other] cs.CY cs.CV cs.HC

DepMamba: Progressive Fusion Mamba for Multimodal Depression Detection

Authors: Jiaxin Ye, Junping Zhang, Hongming Shan

Abstract: Depression is a common mental disorder that affects millions of people worldwide. Although promising, current multimodal detection methods hinge on aligned or aggregated multimodal fusion and suffer two significant limitations: (i) inefficient long-range temporal modeling, and (ii) a sub-optimal balance between intermodal fusion and intramodal processing. In this paper, we propose an audio-visual progressive fusion Mamba for multimodal depression detection, termed DepMamba. DepMamba features two core designs: hierarchical contextual modeling and progressive multimodal fusion. On the one hand, hierarchical modeling introduces convolutional neural networks and Mamba to extract local-to-global features within long-range sequences. On the other hand, progressive fusion first presents a multimodal collaborative State Space Model (SSM) that extracts intermodal and intramodal information for each modality, and then utilizes a multimodal enhanced SSM for modality cohesion. Extensive experimental results on two large-scale depression datasets demonstrate the superior performance of DepMamba over existing state-of-the-art methods. Code is available at https://github.com/Jiaxin-Ye/DepMamba.

Submitted 24 September, 2024; originally announced September 2024.
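Mamba-style blocks build on a linear state-space recurrence for long-range temporal modeling. A minimal, unoptimized, non-selective sketch of that recurrence follows; the matrices are toy values, not DepMamba's learned parameters.

    import torch

    def ssm_scan(u, A, B, C):
        """Linear state-space recurrence: x_k = A x_{k-1} + B u_k, y_k = C x_k."""
        T = u.shape[0]
        x = torch.zeros(A.shape[0])
        ys = []
        for k in range(T):
            x = A @ x + B @ u[k]        # state update
            ys.append(C @ x)            # readout
        return torch.stack(ys)

    A = 0.9 * torch.eye(4)              # stable state transition
    B = torch.randn(4, 2) * 0.1
    C = torch.randn(1, 4)
    print(ssm_scan(torch.randn(50, 2), A, B, C).shape)  # torch.Size([50, 1])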
