Search | arXiv e-print repository
Showing 1–50 of 925 results for author: Yuan, Y
(Search v0.5.6, released 2020-02-24. Searching in archive cs; results sorted by announcement date, newest first; 50 results per page.)
1. arXiv:2502.16832 [pdf, other] (cs.CV)
   FedBM: Stealing Knowledge from Pre-trained Language Models for Heterogeneous Federated Learning
   Authors: Meilu Zhu, Qiushi Yang, Zhifan Gao, Yixuan Yuan, Jun Liu
   Abstract: Federated learning (FL) has shown great potential in medical image computing, since it provides a decentralized learning paradigm that allows multiple clients to train a model collaboratively without privacy leakage. However, current studies have shown that data heterogeneity incurs local learning bias in the classifiers and feature extractors of client models during local training, degrading the performance of the federation. To address these issues, we propose a novel framework called Federated Bias eliMinating (FedBM) to remove local learning bias in heterogeneous FL. It consists of two main modules: Linguistic Knowledge-based Classifier Construction (LKCC) and Concept-guided Global Distribution Estimation (CGDE). LKCC exploits class concepts, prompts, and pre-trained language models (PLMs) to obtain concept embeddings, which are used to estimate the latent concept distribution of each class in the linguistic space. Based on a theoretical derivation, these distributions are used to pre-construct a high-quality classifier for clients; the classifier is frozen to avoid classifier bias during local training. CGDE samples probabilistic concept embeddings from the latent concept distributions to learn a conditional generator that captures the input space of the global model. Three regularization terms are introduced to improve the quality and utility of the generator. The generator is shared by all clients and produces pseudo data to calibrate updates of local feature extractors. Extensive comparison experiments and ablation studies on public datasets demonstrate the superior performance of FedBM over state-of-the-art methods and confirm the effectiveness of each module. The code is available at https://github.com/CUHK-AIM-Group/FedBM.
   Submitted 23 February, 2025; originally announced February 2025.
   Comments: Accepted by MedIA 2025
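   As a reading aid, here is a minimal sketch of the LKCC idea as the abstract describes it: build a frozen classifier from prompt-derived concept embeddings of a pre-trained text encoder. All names (`encode_text`, the templates) are illustrative assumptions, not the paper's actual API.

```python
import torch
import torch.nn.functional as F

def build_frozen_classifier(encode_text, class_names, templates):
    """Average prompt embeddings per class; use them as fixed classifier weights."""
    weights = []
    for name in class_names:
        embs = torch.stack([encode_text(t.format(name)) for t in templates])
        weights.append(F.normalize(embs.mean(dim=0), dim=-1))
    W = torch.stack(weights)       # (num_classes, dim), one prototype per class
    W.requires_grad_(False)        # frozen, so local training cannot bias it
    return W

def classify(features, W, temperature=0.07):
    """Cosine-similarity logits against the frozen class prototypes."""
    return F.normalize(features, dim=-1) @ W.t() / temperature
```

   Under this reading, clients would train only the feature extractor against the fixed logits, which is what keeps the classifier free of local bias.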
2. arXiv:2502.16779 [pdf, other] (cs.CV, cs.AI)
   Unposed Sparse Views Room Layout Reconstruction in the Age of Pretrain Model
   Authors: Yaxuan Huang, Xili Dai, Jianan Wang, Xianbiao Qi, Yixing Yuan, Xiangyu Yue
   Abstract: Room layout estimation from multiple-perspective images is poorly investigated due to the complexities that emerge from multi-view geometry, which requires multi-step solutions such as camera intrinsic and extrinsic estimation, image matching, and triangulation. However, in 3D reconstruction the advancement of recent 3D foundation models such as DUSt3R has shifted the paradigm from the traditional multi-step structure-from-motion process to an end-to-end single-step approach. To this end, we introduce Plane-DUSt3R, a novel method for multi-view room layout estimation leveraging the 3D foundation model DUSt3R. Plane-DUSt3R incorporates the DUSt3R framework and fine-tunes on a room layout dataset (Structure3D) with a modified objective to estimate structural planes. By generating uniform and parsimonious results, Plane-DUSt3R enables room layout estimation with only a single post-processing step and 2D detection results. Unlike previous methods that rely on single-perspective or panorama images, Plane-DUSt3R extends the setting to handle multiple-perspective images. Moreover, it offers a streamlined, end-to-end solution that simplifies the process and reduces error accumulation. Experimental results demonstrate that Plane-DUSt3R not only outperforms state-of-the-art methods on the synthetic dataset but also proves robust and effective on in-the-wild data with different image styles such as cartoon.
   Submitted 23 February, 2025; originally announced February 2025.
3. arXiv:2502.16435 [pdf, other] (cs.CV, cs.CL)
   VisFactor: Benchmarking Fundamental Visual Cognition in Multimodal Large Language Models
   Authors: Jen-Tse Huang, Dasen Dai, Jen-Yuan Huang, Youliang Yuan, Xiaoyuan Liu, Wenxuan Wang, Wenxiang Jiao, Pinjia He, Zhaopeng Tu
   Abstract: Multimodal Large Language Models (MLLMs) have demonstrated remarkable advancements in multimodal understanding; however, their fundamental visual cognitive abilities remain largely underexplored. To bridge this gap, we introduce VisFactor, a novel benchmark derived from the Factor-Referenced Cognitive Test (FRCT), a well-established psychometric assessment of human cognition. VisFactor digitalizes vision-related FRCT subtests to systematically evaluate MLLMs across essential visual cognitive tasks, including spatial reasoning, perceptual speed, and pattern recognition. We present a comprehensive evaluation of state-of-the-art MLLMs, such as GPT-4o, Gemini-Pro, and Qwen-VL, using VisFactor under diverse prompting strategies like Chain-of-Thought and Multi-Agent Debate. Our findings reveal a concerning deficiency in current MLLMs' fundamental visual cognition, with performance frequently approaching random guessing and showing only marginal improvements even with advanced prompting techniques. These results underscore the critical need for focused research to enhance the core visual reasoning capabilities of MLLMs. To foster further investigation in this area, we release our VisFactor benchmark at https://github.com/CUHK-ARISE/VisFactor.
   Submitted 22 February, 2025; originally announced February 2025.
   Comments: Work in progress

4. arXiv:2502.13738 [pdf, other] (cs.CL)
   Enhancing Input-Label Mapping in In-Context Learning with Contrastive Decoding
   Authors: Keqin Peng, Liang Ding, Yuanxin Ouyang, Meng Fang, Yancheng Yuan, Dacheng Tao
   Abstract: Large language models (LLMs) excel at a range of tasks through in-context learning (ICL), where only a few task examples guide their predictions. However, prior research highlights that LLMs often overlook input-label mapping information in ICL, relying more on their pre-trained knowledge. To address this issue, we introduce In-Context Contrastive Decoding (ICCD), a novel method that emphasizes input-label mapping by contrasting the output distributions between positive and negative in-context examples. Experiments on 7 natural language understanding (NLU) tasks show that ICCD brings consistent and significant improvement (up to +2.1 on average) across 6 LLMs of different scales without requiring additional training. Our approach is versatile, enhancing performance with various demonstration-selection methods, which demonstrates its broad applicability and effectiveness. The code and scripts will be publicly released.
   Submitted 19 February, 2025; originally announced February 2025.
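   The contrastive step this abstract describes can be sketched in a few lines. The combination rule and the `alpha` weight below are assumptions for illustration, not necessarily the paper's exact formulation; the snippet assumes a Hugging-Face-style causal LM and tokenizer.

```python
import torch

@torch.no_grad()
def contrastive_next_token(model, tokenizer, pos_prompt, neg_prompt, alpha=1.0):
    """Contrast next-token distributions from positive vs. negative demonstrations."""
    pos = tokenizer(pos_prompt, return_tensors="pt")
    neg = tokenizer(neg_prompt, return_tensors="pt")
    logits_pos = model(**pos).logits[0, -1]   # conditioned on correct input-label pairs
    logits_neg = model(**neg).logits[0, -1]   # conditioned on mismatched labels
    # Amplify what the correct input-label mapping contributes to the prediction.
    return ((1 + alpha) * logits_pos - alpha * logits_neg).argmax().item()
```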
5. arXiv:2502.12566 [pdf, other] (cs.AI)
   Exploring the Impact of Personality Traits on LLM Bias and Toxicity
   Authors: Shuo Wang, Renhao Li, Xi Chen, Yulin Yuan, Derek F. Wong, Min Yang
   Abstract: Given the different roles that AI is expected to play in human life, imbuing large language models (LLMs) with different personalities has attracted increasing research interest. While such "personification" enhances the interactivity and adaptability of LLMs, it raises critical concerns about content safety, particularly regarding the bias, sentiment, and toxicity of LLM generations. This study explores how assigning different personality traits to LLMs affects the toxicity and biases of their outputs. Leveraging the widely accepted HEXACO personality framework developed in social psychology, we design experimentally sound prompts to test three LLMs' performance on three toxicity and bias benchmarks. The findings demonstrate the sensitivity of all three models to HEXACO personality traits and, more importantly, a consistent variation in the bias, negative sentiment, and toxicity of their output. In particular, adjusting the levels of several personality traits can effectively reduce bias and toxicity in model outputs, mirroring the correlations between personality traits and toxic behaviors observed in humans. The findings highlight the need to examine content safety in addition to the efficiency of training or fine-tuning methods for LLM personification. They also suggest that adjusting personality traits may be a simple, low-cost way to perform controlled text generation.
   Submitted 21 February, 2025; v1 submitted 18 February, 2025; originally announced February 2025.

6. arXiv:2502.12171 [pdf, other] (cs.LG, cs.AI, cs.CL)
   GoRA: Gradient-driven Adaptive Low Rank Adaptation
   Authors: Haonan He, Peng Ye, Yuchen Ren, Yuan Yuan, Lei Chen
   Abstract: Low-Rank Adaptation (LoRA) is a crucial method for efficiently fine-tuning pretrained large language models (LLMs), with its performance largely influenced by two key factors: rank and initialization strategy. Numerous LoRA variants have been proposed to enhance performance by addressing these factors, but they often compromise LoRA's usability or efficiency. In this paper, we analyze the fundamental limitations of existing methods and introduce GoRA (Gradient-driven Adaptive Low Rank Adaptation), a novel approach that simultaneously assigns ranks and initializes weights for low-rank adapters based on gradient information. Extensive experimental results demonstrate that GoRA significantly improves performance while preserving the high usability and efficiency of LoRA. On the T5 model fine-tuned for the GLUE benchmark, GoRA achieves a 5.88-point improvement over LoRA and slightly surpasses full fine-tuning. Similarly, on the Llama3.1-8B-Base model fine-tuned for GSM8k tasks, GoRA outperforms LoRA by 5.13 points and exceeds full fine-tuning in high-rank settings by a margin of 2.05 points.
   Submitted 13 February, 2025; originally announced February 2025.
7. arXiv:2502.11442 [pdf, other] (cs.IR, cs.AI, cs.CL, cs.LG)
   Multi-Turn Multi-Modal Question Clarification for Enhanced Conversational Understanding
   Authors: Kimia Ramezan, Alireza Amiri Bavandpour, Yifei Yuan, Clemencia Siro, Mohammad Aliannejadi
   Abstract: Conversational query clarification enables users to refine their search queries through interactive dialogue, improving search effectiveness. Traditional approaches rely on text-based clarifying questions, which often fail to capture complex user preferences, particularly those involving visual attributes. While recent work has explored single-turn multi-modal clarification with images alongside text, such methods do not fully support the progressive refinement of user intent over multiple turns. Motivated by this, we introduce the Multi-turn Multi-modal Clarifying Questions (MMCQ) task, which combines text and visual modalities to refine user queries in a multi-turn conversation. To facilitate this task, we create a large-scale dataset named ClariMM comprising over 13k multi-turn interactions and 33k question-answer pairs containing multi-modal clarifying questions. We propose Mario, a retrieval framework that employs a two-phase ranking strategy: initial retrieval with BM25, followed by a multi-modal generative re-ranking model that integrates textual and visual information from the conversational history. Our experiments show that multi-turn multi-modal clarification outperforms uni-modal and single-turn approaches, improving MRR by 12.88%. The gains are most significant in longer interactions, demonstrating the value of progressive refinement for complex queries.
   Submitted 16 February, 2025; originally announced February 2025.
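   The two-phase ranking strategy named here (BM25 retrieval, then re-ranking) is a standard retrieval pattern; a minimal sketch follows, with `rerank_score` standing in for Mario's multi-modal generative re-ranker.

```python
from rank_bm25 import BM25Okapi  # pip install rank-bm25

def two_phase_rank(query, docs, rerank_score, k=100):
    """Phase 1: lexical BM25 retrieval. Phase 2: re-rank the top-k candidates."""
    bm25 = BM25Okapi([doc.split() for doc in docs])
    scores = bm25.get_scores(query.split())
    top_k = sorted(range(len(docs)), key=lambda i: -scores[i])[:k]
    return sorted(top_k, key=lambda i: -rerank_score(query, docs[i]))
```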
8. arXiv:2502.11211 [pdf, other] (cs.CL, cs.AI, cs.CV)
   A Survey of LLM-based Agents in Medicine: How far are we from Baymax?
   Authors: Wenxuan Wang, Zizhan Ma, Zheng Wang, Chenghan Wu, Wenting Chen, Xiang Li, Yixuan Yuan
   Abstract: Large Language Models (LLMs) are transforming healthcare through the development of LLM-based agents that can understand, reason about, and assist with medical tasks. This survey provides a comprehensive review of LLM-based agents in medicine, examining their architectures, applications, and challenges. We analyze the key components of medical agent systems, including system profiles, clinical planning mechanisms, medical reasoning frameworks, and external capacity enhancement. The survey covers major application scenarios such as clinical decision support, medical documentation, training simulations, and healthcare service optimization. We discuss the evaluation frameworks and metrics used to assess these agents' performance in healthcare settings. While LLM-based agents show promise in enhancing healthcare delivery, several challenges remain, including hallucination management, multimodal integration, implementation barriers, and ethical considerations. The survey concludes by highlighting future research directions, including advances in medical reasoning inspired by recent developments in LLM architectures, integration with physical systems, and improvements in training simulations. This work provides researchers and practitioners with a structured overview of the current state and future prospects of LLM-based agents in medicine.
   Submitted 16 February, 2025; originally announced February 2025.

9. arXiv:2502.11184 [pdf, other] (cs.CL, cs.AI, cs.CV, cs.MM)
   Can't See the Forest for the Trees: Benchmarking Multimodal Safety Awareness for Multimodal LLMs
   Authors: Wenxuan Wang, Xiaoyuan Liu, Kuiyi Gao, Jen-tse Huang, Youliang Yuan, Pinjia He, Shuai Wang, Zhaopeng Tu
   Abstract: Multimodal Large Language Models (MLLMs) have expanded the capabilities of traditional language models by enabling interaction through both text and images. However, ensuring the safety of these models remains a significant challenge, particularly in accurately identifying whether multimodal content is safe or unsafe, a capability we term safety awareness. In this paper, we introduce MMSafeAware, the first comprehensive multimodal safety-awareness benchmark, designed to evaluate MLLMs across 29 safety scenarios with 1500 carefully curated image-prompt pairs. MMSafeAware includes both unsafe and over-safety subsets to assess models' abilities to correctly identify unsafe content and to avoid the over-sensitivity that can hinder helpfulness. Evaluating nine widely used MLLMs with MMSafeAware reveals that current models are not sufficiently safe and are often overly sensitive; for example, GPT-4V misclassifies 36.1% of unsafe inputs as safe and 59.9% of benign inputs as unsafe. We further explore three methods to improve safety awareness (prompting-based approaches, visual contrastive decoding, and vision-centric reasoning fine-tuning) but find that none achieves satisfactory performance. Our findings highlight the profound challenges of developing MLLMs with robust safety awareness and underscore the need for further research in this area. All code and data will be made publicly available to facilitate future research.
   Submitted 16 February, 2025; originally announced February 2025.

10. arXiv:2502.11013 [pdf, other] (cs.LG, cs.AI)
    Collaborative Deterministic-Diffusion Model for Probabilistic Urban Spatiotemporal Prediction
    Authors: Zhi Sheng, Yuan Yuan, Yudi Zhang, Depeng Jin, Yong Li
    Abstract: Accurate prediction of urban spatiotemporal dynamics is essential for enhancing urban management and decision-making. Existing spatiotemporal prediction models are predominantly deterministic, focusing on primary spatiotemporal patterns. However, these dynamics are highly complex, exhibiting multi-modal distributions that deterministic models struggle to capture. In this paper, we highlight the critical role of probabilistic prediction in capturing the uncertainties and complexities inherent in spatiotemporal data. While mainstream probabilistic models can capture uncertainty, they struggle to learn primary patterns accurately and often suffer from computational inefficiency. To address these challenges, we propose CoST, which couples a deterministic model with a probabilistic one to improve both predictive accuracy and the ability to handle uncertainty. To achieve this, we design a mean-residual decomposition framework in which the mean value is modeled by a deterministic model and the residual variations are learned by a probabilistic model, specifically a diffusion model. Moreover, we introduce a scale-aware diffusion process that better accounts for spatially heterogeneous dynamics across regions. Extensive experiments on eight real-world datasets demonstrate that CoST significantly outperforms existing methods on both deterministic and probabilistic metrics, achieving a 20% improvement at low computational cost. CoST bridges the gap between deterministic precision and probabilistic uncertainty, marking a significant advance in urban spatiotemporal prediction.
    Submitted 19 February, 2025; v1 submitted 16 February, 2025; originally announced February 2025.
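    The mean-residual decomposition described above can be sketched compactly; `mean_model` and `residual_sampler` are hypothetical placeholders for the deterministic network and the diffusion-based residual model.

```python
import torch

def cost_predict(x, mean_model, residual_sampler, n_samples=8):
    """Probabilistic forecast = deterministic mean + sampled residuals."""
    mu = mean_model(x)                                # primary spatiotemporal pattern
    res = torch.stack([residual_sampler(x) for _ in range(n_samples)])
    return mu.unsqueeze(0) + res                      # ensemble of n_samples forecasts
```

    The design choice is that the diffusion model only needs to capture deviations around the mean, a simpler and lower-variance target than the full signal.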
11. arXiv:2502.11006 [pdf] (cs.CR, cs.AI)
    Prompt Inject Detection with Generative Explanation as an Investigative Tool
    Authors: Jonathan Pan, Swee Liang Wong, Yidi Yuan, Xin Wei Chia
    Abstract: Large Language Models (LLMs) are vulnerable to adversarial prompt injects, which can jailbreak or exploit vulnerabilities in these models through explicit prompt requests that lead to undesired responses. When investigating prompt injects, the challenge is the sheer volume of input prompts involved, most of which are likely benign. The investigation is further complicated by the semantics and subjectivity of the prompts in the LLM's conversation with its user, and by the context of the environment in which the conversation takes place. The challenge for AI security investigators is therefore two-fold: first to identify adversarial prompt injects, and then to assess whether an identified prompt is contextually benign or adversarial. The first step can be handled by existing AI security solutions such as guardrails, which have been developed using a variety of approaches: a popular one is signature-based detection, and another is to train NLP-based classifiers such as language models. However, in the context of an AI security investigation, these guardrails lack the ability to help investigators triage or assess the identified prompts. In this applied research exploration, we explore the use of the text-generation capabilities of an LLM to detect prompt injects and to generate explanations for its detections, aiding AI security investigators in assessing and triaging such detections. The practical benefit of such a tool is to ease the task of investigating prompt injects.
    Submitted 16 February, 2025; originally announced February 2025.
    Comments: 5 pages, 4 tables, 3 diagrams

12. arXiv:2502.10999 [pdf, other] (cs.CV, cs.AI, cs.CL, cs.MM)
    ControlText: Unlocking Controllable Fonts in Multilingual Text Rendering without Font Annotations
    Authors: Bowen Jiang, Yuan Yuan, Xinyi Bai, Zhuoqun Hao, Alyson Yin, Yaojie Hu, Wenyu Liao, Lyle Ungar, Camillo J. Taylor
    Abstract: This work demonstrates that diffusion models can achieve font-controllable multilingual text rendering using just raw images, without font-label annotations. Visual text rendering remains a significant challenge. While recent methods condition diffusion on glyphs, it is impossible to retrieve exact font annotations from large-scale, real-world datasets, which prevents user-specified font control. To address this, we propose a data-driven solution that integrates the conditional diffusion model with a text segmentation model, using segmentation masks to capture and represent fonts in pixel space in a self-supervised manner. This eliminates the need for any ground-truth labels and lets users customize text rendering with any multilingual font of their choice. The experiments provide a proof of concept of our algorithm in zero-shot text and font editing across diverse fonts and languages, offering valuable insights for the community and industry toward achieving generalized visual text rendering.
    Submitted 16 February, 2025; originally announced February 2025.
    Comments: This is preliminary work and code will be released at github.com/bowen-upenn/ControlText

13. arXiv:2502.10802 [pdf, other] (cs.SE, cs.AI)
    CoCoEvo: Co-Evolution of Programs and Test Cases to Enhance Code Generation
    Authors: Kefan Li, Hongyue Yu, Tingyu Guo, Shijie Cao, Yuan Yuan
    Abstract: Large Language Models (LLMs) have shown remarkable performance in automated code generation. However, existing approaches often rely heavily on pre-defined test cases, which become impractical in scenarios where such cases are unavailable. While prior works explore filtering techniques between programs and test cases, they overlook the refinement of test cases. To address this limitation, we introduce CoCoEvo, a novel LLM-based co-evolution framework that simultaneously evolves programs and test cases. CoCoEvo eliminates the dependency on pre-defined test cases by generating both programs and test cases directly from natural-language problem descriptions and function headers. The framework employs specialized evolutionary operators, including LLM-based crossover and mutation operators for program evolution, along with a test-case generation operator for test-case evolution. Additionally, we propose optimization strategies such as a crossover-rate scheduler to balance exploration and convergence, and a multi-objective optimization method for test-case selection. Experimental results on multiple state-of-the-art LLMs demonstrate that CoCoEvo surpasses existing methods, achieving state-of-the-art performance in automated code generation and testing. These results underscore the potential of co-evolutionary techniques in advancing the field of automated programming.
    Submitted 15 February, 2025; originally announced February 2025.
However, existing approaches often rely heavily on pre-defined test cases, which become impractical in scenarios where such cases are unavailable. While prior works explore filtering techniques between programs and test cases, they overlook the refinement of test cases. To address this limitation, we introduce CoCoEvo, a novel LLM-based co-evolution framework that simultaneously evolves programs and test cases. CoCoEvo eliminates the dependency on pre-defined test cases by generating both programs and test cases directly from natural language problem descriptions and function headers. The framework employs specialized evolutionary operators, including LLM-based crossover and mutation operators for program evolution, along with a test case generation operator for test case evolution. Additionally, we propose optimization strategies such as a crossover rate scheduler to balance exploration and convergence, and a multi-objective optimization method for test case selection. Experimental results on multiple state-of-the-art LLMs demonstrate that CoCoEvo surpasses existing methods, achieving state-of-the-art performance in automated code generation and testing. These results underscore the potential of co-evolutionary techniques in advancing the field of automated programming. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10802v1-abstract-full').style.display = 'none'; document.getElementById('2502.10802v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09978">arXiv:2502.09978</a> <span> [<a href="https://arxiv.org/pdf/2502.09978">pdf</a>, <a href="https://arxiv.org/format/2502.09978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> RoadFed: A Multimodal Federated Learning System for Improving Road Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yachao Yuan</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yali Yuan</a>, <a href="/search/cs?searchtype=author&query=Baker%2C+T">Thar Baker</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09978v1-abstract-short" style="display: inline;"> Internet of Things (IoTs) have been widely applied in Collaborative Intelligent Transportation Systems (C-ITS) for the prevention of road accidents. As one of the primary causes of road accidents in C-ITS, the efficient detection and early alarm of road hazards are of paramount importance. Given the importance, extensive research has explored this topic and obtained favorable results. 
However, mos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09978v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09978v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09978v1-abstract-full" style="display: none;"> Internet of Things (IoTs) have been widely applied in Collaborative Intelligent Transportation Systems (C-ITS) for the prevention of road accidents. Since road hazards are one of the primary causes of road accidents in C-ITS, their efficient detection and early warning are of paramount importance. Given this importance, extensive research has explored the topic and obtained favorable results. However, most existing solutions only explore single-modality data, struggle with high computation and communication overhead, or suffer from the curse of high dimensionality in their privacy-preserving methodologies. To overcome these obstacles, in this paper, we introduce RoadFed, an innovative and private multimodal Federated learning-based system tailored for intelligent Road hazard detection and alarm. This framework encompasses an innovative Multimodal Road Hazard Detector, a communication-efficient federated learning approach, and a customized low-error-rate local differential privacy method crafted for high-dimensional multimodal data. Experimental results reveal that the proposed RoadFed surpasses most existing systems on the self-gathered real-world and CrisisMMD public datasets. In particular, RoadFed achieves an accuracy of 96.42% with a mere 0.0351 seconds of latency, and its communication cost is up to 1,000 times lower than that of existing systems in this field. It facilitates collaborative training with non-iid, high-dimensional multimodal real-world data across various data modalities on multiple edges while ensuring privacy preservation for road users. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09978v1-abstract-full').style.display = 'none'; document.getElementById('2502.09978v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
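<p>RoadFed pairs federated aggregation with a local differential privacy step. The sketch below shows one FedAvg-style round where each client clips its update and adds Gaussian noise before upload; this generic clip-and-perturb mechanism and all constants are illustrative assumptions, not the paper's customized low-error-rate LDP method.</p>
<pre>
import numpy as np

def ldp_perturb(update, clip=1.0, sigma=0.5, rng=None):
    # Clip the local update to bound sensitivity, then add Gaussian
    # noise on the client so only the perturbed vector is uploaded.
    rng = rng or np.random.default_rng()
    norm = np.linalg.norm(update)
    update = update * min(1.0, clip / max(norm, 1e-12))
    return update + rng.normal(0.0, sigma * clip, size=update.shape)

def fedavg_round(global_w, client_updates, weights):
    # Server side: weighted average of the already-perturbed updates.
    stacked = np.stack([ldp_perturb(u) for u in client_updates])
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()
    return global_w + (stacked * w[:, None]).sum(axis=0)
</pre>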
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09838">arXiv:2502.09838</a> <span> [<a href="https://arxiv.org/pdf/2502.09838">pdf</a>, <a href="https://arxiv.org/format/2502.09838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HealthGPT: A Medical Large Vision-Language Model for Unifying Comprehension and Generation via Heterogeneous Knowledge Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tianwei Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sijing Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuqian Yuan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Binhe Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoyuan Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanggui He</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mengze Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaohui Song</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a>, <a href="/search/cs?searchtype=author&query=Ooi%2C+B+C">Beng Chin Ooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09838v3-abstract-short" style="display: inline;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09838v3-abstract-full').style.display = 'inline'; document.getElementById('2502.09838v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09838v3-abstract-full" style="display: none;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-rank adaptation (H-LoRA) technique, which is complemented by a tailored hierarchical visual perception approach and a three-stage learning strategy. To effectively learn the HealthGPT, we devise a comprehensive medical domain-specific comprehension and generation dataset called VL-Health. 
Experimental results demonstrate exceptional performance and scalability of HealthGPT in unified medical visual tasks. Our project can be accessed at https://github.com/DCDmllm/HealthGPT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09838v3-abstract-full').style.display = 'none'; document.getElementById('2502.09838v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Added project page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09541">arXiv:2502.09541</a> <span> [<a href="https://arxiv.org/pdf/2502.09541">pdf</a>, <a href="https://arxiv.org/format/2502.09541">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Vortex: Overcoming Memory Capacity Limitations in GPU-Accelerated Large-Scale Data Analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yichao Yuan</a>, <a href="/search/cs?searchtype=author&query=Iyer%2C+A">Advait Iyer</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lin Ma</a>, <a href="/search/cs?searchtype=author&query=Talati%2C+N">Nishil Talati</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09541v1-abstract-short" style="display: inline;"> Despite the high computational throughput of GPUs, limited memory capacity and bandwidth-limited CPU-GPU communication via PCIe links remain significant bottlenecks for accelerating large-scale data analytics workloads. This paper introduces Vortex, a GPU-accelerated framework designed for data analytics workloads that exceed GPU memory capacity. A key aspect of our framework is an optimized IO pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09541v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09541v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09541v1-abstract-full" style="display: none;"> Despite the high computational throughput of GPUs, limited memory capacity and bandwidth-limited CPU-GPU communication via PCIe links remain significant bottlenecks for accelerating large-scale data analytics workloads. This paper introduces Vortex, a GPU-accelerated framework designed for data analytics workloads that exceed GPU memory capacity. A key aspect of our framework is an optimized IO primitive that leverages all available PCIe links in multi-GPU systems for the IO demand of a single target GPU.
It routes data through other GPUs to the target GPU that handles IO-intensive analytics tasks. This approach is advantageous when other GPUs are occupied with compute-bound workloads, such as popular AI applications that typically underutilize IO resources. We also introduce a novel programming model that separates GPU kernel development from IO scheduling, reducing programmer burden and enabling GPU code reuse. Additionally, we present the design of certain important query operators and discuss a late materialization technique based on GPU's zero-copy memory access. Without caching any data in GPU memory, Vortex improves the performance of the state-of-the-art GPU baseline, Proteus, by 5.7$\times$ on average and enhances price performance by 2.5$\times$ compared to a CPU-based DuckDB baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09541v1-abstract-full').style.display = 'none'; document.getElementById('2502.09541v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">VLDB 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09423">arXiv:2502.09423</a> <span> [<a href="https://arxiv.org/pdf/2502.09423">pdf</a>, <a href="https://arxiv.org/format/2502.09423">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Transformer-Enhanced Variational Autoencoder for Crystal Structure Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyi Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yang Yuan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siming Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jialong Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Sihan Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yangang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongguo Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09423v1-abstract-short" style="display: inline;"> Crystal structure forms the foundation for understanding the physical and chemical properties of materials. Generative models have emerged as a new paradigm in crystal structure prediction (CSP); however, accurately capturing key characteristics of crystal structures, such as periodicity and symmetry, remains a significant challenge.
In this paper, we propose a Transformer-Enhanced Variational Auto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09423v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09423v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09423v1-abstract-full" style="display: none;"> Crystal structure forms the foundation for understanding the physical and chemical properties of materials. Generative models have emerged as a new paradigm in crystal structure prediction (CSP); however, accurately capturing key characteristics of crystal structures, such as periodicity and symmetry, remains a significant challenge. In this paper, we propose a Transformer-Enhanced Variational Autoencoder for Crystal Structure Prediction (TransVAE-CSP), which learns the characteristic distribution space of stable materials, enabling both the reconstruction and generation of crystal structures. TransVAE-CSP integrates adaptive distance expansion with irreducible representation to effectively capture the periodicity and symmetry of crystal structures, and the encoder is a transformer network based on an equivariant dot product attention mechanism. Experimental results on the carbon_24, perov_5, and mp_20 datasets demonstrate that TransVAE-CSP outperforms existing methods in structure reconstruction and generation tasks under various modeling metrics, offering a powerful tool for crystal structure design and optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09423v1-abstract-full').style.display = 'none'; document.getElementById('2502.09423v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
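<p>For readers unfamiliar with the VAE side of such models, the toy PyTorch skeleton below shows the encode-sample-decode loop with a transformer encoder producing the latent mean and log-variance. It is a generic sketch with made-up dimensions, not the paper's equivariant dot-product-attention architecture.</p>
<pre>
import torch
import torch.nn as nn

class TinyVAE(nn.Module):
    """Toy VAE: transformer encoder, reparameterized latent, linear decoder."""
    def __init__(self, dim=64, latent=16):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.to_mu = nn.Linear(dim, latent)
        self.to_logvar = nn.Linear(dim, latent)
        self.decoder = nn.Linear(latent, dim)

    def forward(self, x):                  # x: (batch, atoms, dim)
        h = self.encoder(x).mean(dim=1)    # pool over atom tokens
        mu, logvar = self.to_mu(h), self.to_logvar(h)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()
        return self.decoder(z), mu, logvar
</pre>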
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08309">arXiv:2502.08309</a> <span> [<a href="https://arxiv.org/pdf/2502.08309">pdf</a>, <a href="https://arxiv.org/format/2502.08309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unlocking Scaling Law in Industrial Recommendation Systems with a Three-step Paradigm based Large User Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+B">Bencheng Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilei Liu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yizhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yujin Yuan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Langming Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqi Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&query=Pengjie%2C+W">Wang Pengjie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08309v1-abstract-short" style="display: inline;"> Recent advancements in autoregressive Large Language Models (LLMs) have achieved significant milestones, largely attributed to their scalability, often referred to as the "scaling law". Inspired by these achievements, there has been a growing interest in adapting LLMs for Recommendation Systems (RecSys) by reformulating RecSys tasks into generative problems. However, these End-to-End Generative Re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08309v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08309v1-abstract-full" style="display: none;"> Recent advancements in autoregressive Large Language Models (LLMs) have achieved significant milestones, largely attributed to their scalability, often referred to as the "scaling law". Inspired by these achievements, there has been a growing interest in adapting LLMs for Recommendation Systems (RecSys) by reformulating RecSys tasks into generative problems. However, these End-to-End Generative Recommendation (E2E-GR) methods tend to prioritize idealized goals, often at the expense of the practical advantages offered by traditional Deep Learning based Recommendation Models (DLRMs) in terms of features, architecture, and practices. This disparity between idealized goals and practical needs introduces several challenges and limitations, locking the scaling law in industrial RecSys.
In this paper, we introduce a large user model (LUM) that addresses these limitations through a three-step paradigm, designed to meet the stringent requirements of industrial settings while unlocking the potential for scalable recommendations. Our extensive experimental evaluations demonstrate that LUM outperforms both state-of-the-art DLRMs and E2E-GR approaches. Notably, LUM exhibits excellent scalability, with performance improvements observed as the model scales up to 7 billion parameters. Additionally, we have successfully deployed LUM in an industrial application, where it achieved significant gains in an A/B test, further validating its effectiveness and practicality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08309v1-abstract-full').style.display = 'none'; document.getElementById('2502.08309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08281">arXiv:2502.08281</a> <span> [<a href="https://arxiv.org/pdf/2502.08281">pdf</a>, <a href="https://arxiv.org/ps/2502.08281">ps</a>, <a href="https://arxiv.org/format/2502.08281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Redefining Simplicity: Benchmarking Large Language Models from Lexical to Document Simplification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiang%2C+J">Jipeng Qiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minjiang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yunhao Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kui Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08281v1-abstract-short" style="display: inline;"> Text simplification (TS) refers to the process of reducing the complexity of a text while retaining its original meaning and key information. Existing work only shows that large language models (LLMs) have outperformed supervised non-LLM-based methods on sentence simplification. This study offers the first comprehensive analysis of LLM performance across four TS tasks: lexical, syntactic, sentence… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08281v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08281v1-abstract-full" style="display: none;"> Text simplification (TS) refers to the process of reducing the complexity of a text while retaining its original meaning and key information. Existing work only shows that large language models (LLMs) have outperformed supervised non-LLM-based methods on sentence simplification. 
This study offers the first comprehensive analysis of LLM performance across four TS tasks: lexical, syntactic, sentence, and document simplification. We compare lightweight, closed-source and open-source LLMs against traditional non-LLM methods using automatic metrics and human evaluations. Our experiments reveal that LLMs not only outperform non-LLM approaches in all four tasks but also often generate outputs that exceed the quality of existing human-annotated references. Finally, we present some future directions of TS in the era of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08281v1-abstract-full').style.display = 'none'; document.getElementById('2502.08281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06805">arXiv:2502.06805</a> <span> [<a href="https://arxiv.org/pdf/2502.06805">pdf</a>, <a href="https://arxiv.org/format/2502.06805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Efficient Diffusion Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+B">Boning Xiong</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shoufa Chen</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zixuan Gong</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+G">Guangyin Bao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongfeng Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06805v1-abstract-short" style="display: inline;"> Diffusion models have emerged as powerful generative models capable of producing high-quality contents such as images, videos, and audio, demonstrating their potential to revolutionize digital content creation. 
However, these capabilities come at the cost of their significant computational resources and lengthy generation time, underscoring the critical need to develop efficient techniques for pra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06805v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06805v1-abstract-full" style="display: none;"> Diffusion models have emerged as powerful generative models capable of producing high-quality contents such as images, videos, and audio, demonstrating their potential to revolutionize digital content creation. However, these capabilities come at the cost of their significant computational resources and lengthy generation time, underscoring the critical need to develop efficient techniques for practical deployment. In this survey, we provide a systematic and comprehensive review of research on efficient diffusion models. We organize the literature in a taxonomy consisting of three main categories, covering distinct yet interconnected efficient diffusion model topics from algorithm-level, system-level, and framework perspective, respectively. We have also created a GitHub repository where we organize the papers featured in this survey at https://github.com/AIoT-MLSys-Lab/Efficient-Diffusion-Model-Survey. We hope our survey can serve as a valuable resource to help researchers and practitioners gain a systematic understanding of efficient diffusion model research and inspire them to contribute to this important and exciting field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06805v1-abstract-full').style.display = 'none'; document.getElementById('2502.06805v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06327">arXiv:2502.06327</a> <span> [<a href="https://arxiv.org/pdf/2502.06327">pdf</a>, <a href="https://arxiv.org/format/2502.06327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Driven Continual Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianfei Zhou</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+R">Rui Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06327v1-abstract-short" style="display: inline;"> Continual Graph Learning (CGL), which aims to accommodate new tasks over evolving graph data without forgetting prior knowledge, is garnering significant research interest. 
Mainstream solutions adopt the memory replay-based idea, i.e., caching representative data from earlier tasks for retraining the graph model. However, this strategy struggles with scalability issues for constantly evolving graphs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06327v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06327v1-abstract-full" style="display: none;"> Continual Graph Learning (CGL), which aims to accommodate new tasks over evolving graph data without forgetting prior knowledge, is garnering significant research interest. Mainstream solutions adopt the memory replay-based idea, i.e., caching representative data from earlier tasks for retraining the graph model. However, this strategy struggles with scalability issues for constantly evolving graphs and raises concerns regarding data privacy. Inspired by recent advancements in the prompt-based learning paradigm, this paper introduces a novel prompt-driven continual graph learning (PROMPTCGL) framework, which learns a separate prompt for each incoming task and maintains the underlying graph neural network model fixed. In this way, PROMPTCGL naturally avoids catastrophic forgetting of knowledge from previous tasks. More specifically, we propose hierarchical prompting to instruct the model at both the feature and topology levels to fully address the variability of task graphs in dynamic continual learning. Additionally, we develop a personalized prompt generator to generate tailored prompts for each graph node while minimizing the number of prompts needed, leading to constant memory consumption regardless of the graph scale. Extensive experiments on four benchmarks show that PROMPTCGL achieves superior performance against existing CGL approaches while significantly reducing memory consumption. Our code is available at https://github.com/QiWang98/PromptCGL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06327v1-abstract-full').style.display = 'none'; document.getElementById('2502.06327v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04728">arXiv:2502.04728</a> <span> [<a href="https://arxiv.org/pdf/2502.04728">pdf</a>, <a href="https://arxiv.org/format/2502.04728">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generating Symbolic World Models via Test-time Scaling of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhouliang Yu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuhuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T+Z">Tim Z.
Xiao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F+F">Fuxiang Frank Xia</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jie Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+G">Ge Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiyang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04728v1-abstract-short" style="display: inline;"> Solving complex planning problems requires Large Language Models (LLMs) to explicitly model the state transition to avoid rule violations, comply with constraints, and ensure optimality-a task hindered by the inherent ambiguity of natural language. To overcome such ambiguity, Planning Domain Definition Language (PDDL) is leveraged as a planning abstraction that enables precise and formal state des… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04728v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04728v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04728v1-abstract-full" style="display: none;"> Solving complex planning problems requires Large Language Models (LLMs) to explicitly model the state transition to avoid rule violations, comply with constraints, and ensure optimality-a task hindered by the inherent ambiguity of natural language. To overcome such ambiguity, Planning Domain Definition Language (PDDL) is leveraged as a planning abstraction that enables precise and formal state descriptions. With PDDL, we can generate a symbolic world model where classic searching algorithms, such as A*, can be seamlessly applied to find optimal plans. However, directly generating PDDL domains with current LLMs remains an open challenge due to the lack of PDDL training data. To address this challenge, we propose to scale up the test-time computation of LLMs to enhance their PDDL reasoning capabilities, thereby enabling the generation of high-quality PDDL domains. Specifically, we introduce a simple yet effective algorithm, which first employs a Best-of-N sampling approach to improve the quality of the initial solution and then refines the solution in a fine-grained manner with verbalized machine learning. Our method outperforms o1-mini by a considerable margin in the generation of PDDL domain, achieving over 50% success rate on two tasks (i.e., generating PDDL domains from natural language description or PDDL problems). This is done without requiring additional training. By taking advantage of PDDL as state abstraction, our method is able to outperform current state-of-the-art methods on almost all competition-level planning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04728v1-abstract-full').style.display = 'none'; document.getElementById('2502.04728v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
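<p>The two-stage test-time scaling recipe in the abstract above (Best-of-N sampling, then fine-grained refinement with verbalized feedback) reduces to a small generic loop. Here generate, score, critique, and revise are assumed callables wrapping an LLM sampler, a PDDL validator or critic, and a feedback-driven rewriter; none of them are the paper's actual components.</p>
<pre>
def best_of_n(task, generate, score, n=8):
    # Sample N candidate PDDL domains and keep the highest-scoring one.
    candidates = [generate(task) for _ in range(n)]
    return max(candidates, key=score)

def refine(domain, critique, revise, steps=3):
    # Feed verbalized feedback back to the model until the critic
    # is satisfied or the step budget runs out.
    for _ in range(steps):
        feedback = critique(domain)
        if not feedback:          # no remaining issues found
            break
        domain = revise(domain, feedback)
    return domain
</pre>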
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report v1 (32 pages, 6 figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03997">arXiv:2502.03997</a> <span> [<a href="https://arxiv.org/pdf/2502.03997">pdf</a>, <a href="https://arxiv.org/format/2502.03997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CAD-Editor: A Locate-then-Infill Framework with Automated Training Data Synthesis for Text-Based CAD Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yu Yuan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shizhao Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+J">Jiang Bian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03997v1-abstract-short" style="display: inline;"> Computer Aided Design (CAD) is indispensable across various industries. \emph{Text-based CAD editing}, which automates the modification of CAD models based on textual instructions, holds great potential but remains underexplored. Existing methods primarily focus on design variation generation or text-based CAD generation, either lacking support for text-based control or neglecting existing CAD mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03997v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03997v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03997v1-abstract-full" style="display: none;"> Computer Aided Design (CAD) is indispensable across various industries. \emph{Text-based CAD editing}, which automates the modification of CAD models based on textual instructions, holds great potential but remains underexplored. Existing methods primarily focus on design variation generation or text-based CAD generation, either lacking support for text-based control or neglecting existing CAD models as constraints. We introduce \emph{CAD-Editor}, the first framework for text-based CAD editing. To address the challenge of demanding triplet data with accurate correspondence for training, we propose an automated data synthesis pipeline. This pipeline utilizes design variation models to generate pairs of original and edited CAD models and employs Large Vision-Language Models (LVLMs) to summarize their differences into editing instructions. To tackle the composite nature of text-based CAD editing, we propose a locate-then-infill framework that decomposes the task into two focused sub-tasks: locating regions requiring modification and infilling these regions with appropriate edits. Large Language Models (LLMs) serve as the backbone for both sub-tasks, leveraging their capabilities in natural language understanding and CAD knowledge. Experiments show that CAD-Editor achieves superior performance both quantitatively and qualitatively. 
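<p>The locate-then-infill decomposition that CAD-Editor describes can be sketched as two chained generations: one that masks the regions to change and one that fills only those masks. The llm callable and the [MASK] convention below are illustrative assumptions, not the paper's actual prompts or masking scheme.</p>
<pre>
def locate_then_infill(cad_seq, instruction, llm):
    # Stage 1 (locate): ask the model to mark only the spans that
    # must change for the requested edit.
    masked = llm(
        "Rewrite this CAD sequence, replacing only the regions that "
        f"must change for the edit '{instruction}' with [MASK]:\n"
        f"{cad_seq}"
    )
    # Stage 2 (infill): fill every mask, leaving other tokens intact.
    edited = llm(
        f"Fill every [MASK] so the result satisfies '{instruction}'. "
        f"Keep all other tokens unchanged:\n{masked}"
    )
    return edited
</pre>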
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03997v1-abstract-full').style.display = 'none'; document.getElementById('2502.03997v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03496">arXiv:2502.03496</a> <span> [<a href="https://arxiv.org/pdf/2502.03496">pdf</a>, <a href="https://arxiv.org/format/2502.03496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> FreqPrior: Improving Video Diffusion Models with Frequency Filtering Gaussian Noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yunlong Yuan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuanfan Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03496v2-abstract-short" style="display: inline;"> Text-driven video generation has advanced significantly due to developments in diffusion models. Beyond the training and sampling phases, recent studies have investigated noise priors of diffusion models, as improved noise priors yield better generation results. One recent approach employs the Fourier transform to manipulate noise, marking the initial exploration of frequency operations in this co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03496v2-abstract-full').style.display = 'inline'; document.getElementById('2502.03496v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03496v2-abstract-full" style="display: none;"> Text-driven video generation has advanced significantly due to developments in diffusion models. Beyond the training and sampling phases, recent studies have investigated noise priors of diffusion models, as improved noise priors yield better generation results. One recent approach employs the Fourier transform to manipulate noise, marking the initial exploration of frequency operations in this context. However, it often generates videos that lack motion dynamics and imaging details. In this work, we provide a comprehensive theoretical analysis of the variance decay issue present in existing methods, contributing to the loss of details and motion dynamics. Recognizing the critical impact of noise distribution on generation quality, we introduce FreqPrior, a novel noise initialization strategy that refines noise in the frequency domain. 
Our method features a novel filtering technique designed to address different frequency signals while maintaining the noise prior distribution that closely approximates a standard Gaussian distribution. Additionally, we propose a partial sampling process by perturbing the latent at an intermediate timestep during finding the noise prior, significantly reducing inference time without compromising quality. Extensive experiments on VBench demonstrate that our method achieves the highest scores in both quality and semantic assessments, resulting in the best overall total score. These results highlight the superiority of our proposed noise prior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03496v2-abstract-full').style.display = 'none'; document.getElementById('2502.03496v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01669">arXiv:2502.01669</a> <span> [<a href="https://arxiv.org/pdf/2502.01669">pdf</a>, <a href="https://arxiv.org/format/2502.01669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Addressing Delayed Feedback in Conversion Rate Prediction via Influence Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+C">Chenlu Ding</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiancan Wu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yancheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junfeng Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cunchun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01669v1-abstract-short" style="display: inline;"> In the realm of online digital advertising, conversion rate (CVR) prediction plays a pivotal role in maximizing revenue under cost-per-conversion (CPA) models, where advertisers are charged only when users complete specific actions, such as making a purchase. 
A major challenge in CVR prediction lies in the delayed feedback problem-conversions may occur hours or even weeks after initial user intera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01669v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01669v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01669v1-abstract-full" style="display: none;"> In the realm of online digital advertising, conversion rate (CVR) prediction plays a pivotal role in maximizing revenue under cost-per-conversion (CPA) models, where advertisers are charged only when users complete specific actions, such as making a purchase. A major challenge in CVR prediction lies in the delayed feedback problem-conversions may occur hours or even weeks after initial user interactions. This delay complicates model training, as recent data may be incomplete, leading to biases and diminished performance. Although existing methods attempt to address this issue, they often fall short in adapting to evolving user behaviors and depend on auxiliary models, which introduces computational inefficiencies and the risk of model inconsistency. In this work, we propose an Influence Function-empowered framework for Delayed Feedback Modeling (IF-DFM). IF-DFM leverages influence functions to estimate how newly acquired and delayed conversion data impact model parameters, enabling efficient parameter updates without the need for full retraining. Additionally, we present a scalable algorithm that efficiently computes parameter updates by reframing the inverse Hessian-vector product as an optimization problem, striking a balance between computational efficiency and effectiveness. Extensive experiments on benchmark datasets demonstrate that IF-DFM consistently surpasses state-of-the-art methods, significantly enhancing both prediction accuracy and model adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01669v1-abstract-full').style.display = 'none'; document.getElementById('2502.01669v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
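<p>The computational core of an influence-function update like IF-DFM's is estimating an inverse Hessian-vector product without forming the Hessian. The paper reframes this as an optimization problem; the snippet below shows one standard instance of that idea, conjugate gradient over Hessian-vector products, where hvp is an assumed callable returning the product of the Hessian with a vector.</p>
<pre>
import numpy as np

def conjugate_gradient(hvp, g, iters=50, tol=1e-6):
    # Approximately solve H x = g using only Hessian-vector products,
    # so x approximates the inverse-Hessian-vector product needed for
    # influence-based parameter updates without full retraining.
    x = np.zeros_like(g)
    r = g - hvp(x)
    p = r.copy()
    rs = r @ r
    for _ in range(iters):
        hp = hvp(p)
        alpha = rs / (p @ hp)
        x = x + alpha * p
        r = r - alpha * hp
        rs_new = r @ r
        if tol > np.sqrt(rs_new):   # residual small enough; stop
            break
        p = r + (rs_new / rs) * p
        rs = rs_new
    return x
</pre>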
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01074">arXiv:2502.01074</a> <span> [<a href="https://arxiv.org/pdf/2502.01074">pdf</a>, <a href="https://arxiv.org/format/2502.01074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Omni-Mol: Exploring Universal Convergent Space for Omni-Molecular Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chengxin Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yihe Yuan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zezheng Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haixin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01074v1-abstract-short" style="display: inline;"> Building generalist models has recently demonstrated remarkable capabilities in diverse scientific domains. Within the realm of molecular learning, several studies have explored unifying diverse tasks across diverse domains. However, conflicts and interference between molecules and knowledge from different domains may have adverse impacts in three ways. First, conflicting molecular represent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01074v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01074v1-abstract-full" style="display: none;"> Building generalist models has recently demonstrated remarkable capabilities in diverse scientific domains. Within the realm of molecular learning, several studies have explored unifying diverse tasks across diverse domains. However, conflicts and interference between molecules and knowledge from different domains may have adverse impacts in three ways. First, conflicting molecular representations can lead to optimization difficulties for the models. Second, mixing and scaling up training data across diverse tasks is inherently challenging. Third, the computational cost of refined pretraining is prohibitively high. To address these limitations, this paper presents Omni-Mol, a scalable and unified LLM-based framework for direct instruction tuning. Omni-Mol builds on three key components to tackle conflicts: (1) a unified encoding mechanism for any task input; (2) an active-learning-driven data selection strategy that significantly reduces dataset size; (3) a novel design of the adaptive gradient stabilization module and anchor-and-reconcile MoE framework that ensures stable convergence. Experimentally, Omni-Mol achieves state-of-the-art performance across 15 molecular tasks, demonstrates the presence of scaling laws in the molecular domain, and is supported by extensive ablation studies and analyses validating the effectiveness of its design. The code and weights of the powerful AI-driven chemistry generalist are open-sourced at: https://anonymous.4open.science/r/Omni-Mol-8EDB.
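<p>The abstract above names an anchor-and-reconcile MoE framework without detailing it; as a generic reference point, the PyTorch layer below implements plain top-k expert routing, which is the usual starting point for such designs. Expert count, sizes, and routing are illustrative only.</p>
<pre>
import torch
import torch.nn as nn

class TopKMoE(nn.Module):
    """Generic top-k mixture-of-experts layer (illustrative only)."""
    def __init__(self, dim=64, n_experts=4, k=2):
        super().__init__()
        self.gate = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList(
            nn.Linear(dim, dim) for _ in range(n_experts))
        self.k = k

    def forward(self, x):                       # x: (tokens, dim)
        scores = self.gate(x).softmax(dim=-1)   # routing probabilities
        top, idx = scores.topk(self.k, dim=-1)  # keep k experts per token
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e        # tokens routed to expert e
                if mask.any():
                    gate_w = top[mask, slot].unsqueeze(-1)
                    out[mask] = out[mask] + gate_w * expert(x[mask])
        return out
</pre>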
arXiv:2502.00883 [pdf, other] cs.LG cs.CL
SimPER: A Minimalist Approach to Preference Alignment without Hyperparameters
Authors: Teng Xiao, Yige Yuan, Zhengyu Chen, Mingxiao Li, Shangsong Liang, Zhaochun Ren, Vasant G Honavar
Abstract: Existing preference optimization objectives for language model alignment require additional hyperparameters that must be extensively tuned to achieve optimal performance, increasing both the complexity and time required for fine-tuning large language models. In this paper, we propose a simple yet effective hyperparameter-free preference optimization algorithm for alignment. We observe that promising performance can be achieved simply by optimizing inverse perplexity, which is calculated as the inverse of the exponentiated average log-likelihood of the chosen and rejected responses in the preference dataset. The resulting simple learning objective, SimPER, is easy to implement and eliminates the need for expensive hyperparameter tuning and a reference model, making it both computationally and memory efficient. Extensive experiments on widely used real-world benchmarks, including MT-Bench, AlpacaEval 2, and 10 key benchmarks of the Open LLM Leaderboard with 5 base models, demonstrate that SimPER consistently and significantly outperforms existing approaches, even without any hyperparameters or a reference model. For example, despite its simplicity, SimPER outperforms state-of-the-art methods by up to 5.7 points on AlpacaEval 2 and achieves the highest average ranking across 10 benchmarks on the Open LLM Leaderboard. The source code for SimPER is publicly available at: https://github.com/tengxiao1/SimPER.
Submitted 20 February, 2025; v1 submitted 2 February, 2025; originally announced February 2025.
Comments: ICLR 2025

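A minimal sketch of the inverse-perplexity objective as the abstract states it; the sign convention and helper names are my reading, not the released implementation (which is at the URL above):

    import torch

    def inverse_perplexity(logits, labels, mask):
        # exp(mean token log-likelihood) = 1 / perplexity of the response.
        # Assumes logits are already aligned with labels; mask marks
        # response tokens.
        logps = torch.log_softmax(logits, dim=-1)
        token_logps = torch.gather(logps, -1, labels.unsqueeze(-1)).squeeze(-1)
        avg_logp = (token_logps * mask).sum(-1) / mask.sum(-1)
        return torch.exp(avg_logp)

    def simper_loss(chosen_logits, chosen_labels, chosen_mask,
                    rejected_logits, rejected_labels, rejected_mask):
        # Raise the inverse perplexity of chosen responses and lower that of
        # rejected ones; no reference model and no tunable hyperparameter
        # appear anywhere in the objective.
        return -(inverse_perplexity(chosen_logits, chosen_labels, chosen_mask)
                 - inverse_perplexity(rejected_logits, rejected_labels,
                                      rejected_mask)).mean()
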
arXiv:2502.00835 [pdf, other] cs.RO cs.LG
CAIMAN: Causal Action Influence Detection for Sample Efficient Loco-manipulation
Authors: Yuanchen Yuan, Jin Cheng, Núria Armengol Urpí, Stelian Coros
Abstract: Enabling legged robots to perform non-prehensile loco-manipulation with large and heavy objects is crucial for enhancing their versatility. However, this is a challenging task, often requiring sophisticated planning strategies or extensive task-specific reward shaping, especially in unstructured scenarios with obstacles. In this work, we present CAIMAN, a novel framework for learning loco-manipulation that relies solely on sparse task rewards. We leverage causal action influence to detect states where the robot is in control over other entities in the environment, and use this measure as an intrinsically motivated objective to enable sample-efficient learning. We employ a hierarchical control strategy, combining a low-level locomotion policy with a high-level policy that prioritizes task-relevant velocity commands. Through simulated and real-world experiments, including object manipulation with obstacles, we demonstrate the framework's superior sample efficiency, adaptability to diverse environments, and successful transfer to hardware without fine-tuning. The proposed approach paves the way for scalable, robust, and autonomous loco-manipulation in real-world applications.
Submitted 2 February, 2025; originally announced February 2025.
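At its simplest, using a causal-influence measure as an intrinsic objective amounts to reward shaping on top of the sparse task reward, with a hierarchical split between velocity commands and joint control. A toy sketch under those assumptions; the influence estimator itself is abstracted away, and every name here is a placeholder:

    def shaped_reward(task_reward, influence_score, beta=0.1):
        # Intrinsic bonus for states where actions causally influence other
        # entities (e.g., the pushed object); guides exploration when the
        # task reward is sparse.
        return task_reward + beta * influence_score

    def act(state, high_level_policy, low_level_policy):
        # Hierarchical control: the high level picks task-relevant base
        # velocity commands; the low-level locomotion policy turns them
        # into joint targets.
        velocity_cmd = high_level_policy(state)
        return low_level_policy(state, velocity_cmd)
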
arXiv:2501.19054 [pdf, other] cs.CV cs.LG
Text-to-CAD Generation Through Infusing Visual Feedback in Large Language Models
Authors: Ruiyu Wang, Yu Yuan, Shizhao Sun, Jiang Bian
Abstract: Creating Computer-Aided Design (CAD) models requires significant expertise and effort. Text-to-CAD, which converts textual descriptions into CAD parametric sequences, is crucial in streamlining this process. Recent studies have utilized ground-truth parametric sequences, known as sequential signals, as supervision to achieve this goal. However, CAD models are inherently multimodal, comprising parametric sequences and corresponding rendered visual objects. Besides, the rendering process from parametric sequences to visual objects is many-to-one. Therefore, both sequential and visual signals are critical for effective training. In this work, we introduce CADFusion, a framework that uses Large Language Models (LLMs) as the backbone and alternates between two training stages: the sequential learning (SL) stage and the visual feedback (VF) stage. In the SL stage, we train LLMs using ground-truth parametric sequences, enabling the generation of logically coherent parametric sequences. In the VF stage, we reward parametric sequences that render into visually preferred objects and penalize those that do not, allowing LLMs to learn how rendered visual objects are perceived and evaluated. These two stages alternate throughout the training, ensuring balanced learning and preserving the benefits of both signals. Experiments demonstrate that CADFusion significantly improves performance, both qualitatively and quantitatively.
Submitted 5 February, 2025; v1 submitted 31 January, 2025; originally announced January 2025.
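The alternating two-stage schedule can be sketched as below; sl_step and vf_step are hypothetical callables standing in for the supervised update on ground-truth sequences and the render-then-reward update, respectively:

    def train_cadfusion(llm, sl_step, vf_step,
                        rounds=10, sl_steps=100, vf_steps=100):
        """Alternate sequential-learning (SL) and visual-feedback (VF) stages
        so the two signals stay balanced throughout training.

        sl_step(llm): one supervised update on ground-truth parametric
                      sequences (keeps generations logically coherent).
        vf_step(llm): one reward update that renders generated sequences and
                      reinforces the visually preferred ones.
        """
        for _ in range(rounds):
            for _ in range(sl_steps):
                sl_step(llm)
            for _ in range(vf_steps):
                vf_step(llm)
        return llm
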
arXiv:2501.16679 [pdf, other] cs.CV
Polyp-Gen: Realistic and Diverse Polyp Image Generation for Endoscopic Dataset Expansion
Authors: Shengyuan Liu, Zhen Chen, Qiushi Yang, Weihao Yu, Di Dong, Jiancong Hu, Yixuan Yuan
Abstract: Automated diagnostic systems (ADS) have shown significant potential in the early detection of polyps during endoscopic examinations, thereby reducing the incidence of colorectal cancer. However, due to high annotation costs and strict privacy concerns, acquiring high-quality endoscopic images poses a considerable challenge in the development of ADS. Despite recent advancements in generating synthetic images for dataset expansion, existing endoscopic image generation algorithms fail to accurately generate the details of polyp boundary regions and typically require medical priors to specify plausible locations and shapes of polyps, which limits the realism and diversity of the generated images. To address these limitations, we present Polyp-Gen, the first fully automatic diffusion-based endoscopic image generation framework. Specifically, we devise a spatial-aware diffusion training scheme with a lesion-guided loss to enhance the structural context of polyp boundary regions. Moreover, to capture medical priors for the localization of potential polyp areas, we introduce a hierarchical retrieval-based sampling strategy to match similar fine-grained spatial features. In this way, Polyp-Gen can generate realistic and diverse endoscopic images for building reliable ADS. Extensive experiments demonstrate state-of-the-art generation quality, and the synthetic images can improve the downstream polyp detection task. Additionally, Polyp-Gen has shown remarkable zero-shot generalizability on other datasets. The source code is available at https://github.com/CUHK-AIM-Group/Polyp-Gen.
Submitted 29 January, 2025; v1 submitted 27 January, 2025; originally announced January 2025.
Comments: Accepted by ICRA 2025

arXiv:2501.15838 [pdf] cond-mat.mtrl-sci cs.AI
CrySPAI: A New Crystal Structure Prediction Software Based on Artificial Intelligence
Authors: Zongguo Wang, Ziyi Chen, Yang Yuan, Yangang Wang
Abstract: Crystal structure predictions based on the combination of first-principles calculations and machine learning have achieved significant success in materials science. However, most of these approaches are limited to predicting specific systems, which hinders their application to unknown or unexplored domains. In this paper, we present CrySPAI, a crystal structure prediction package developed using artificial intelligence (AI) to predict energetically stable crystal structures of inorganic materials given their chemical compositions. The software consists of three key modules: an evolutionary optimization algorithm (EOA) that searches for all possible crystal structure configurations, density functional theory (DFT) that provides accurate energy values for these structures, and a deep neural network (DNN) that learns the relationship between crystal structures and their corresponding energies. To optimize the process across these modules, a distributed framework is implemented to parallelize tasks, and an automated workflow has been integrated into CrySPAI for seamless execution. This paper reports the development and implementation of the AI-based CrySPAI crystal prediction software tool and its unique features.
Submitted 27 January, 2025; originally announced January 2025.
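Read at the highest level, the three modules form a propose-evaluate-learn loop. A hedged sketch (all names are placeholders; the actual package distributes these tasks across workers through its automated workflow):

    def cryspai_search(composition, propose, dft_energy, dnn, generations=20):
        """Sketch of the EOA + DFT + DNN loop: the evolutionary optimizer
        proposes candidate crystal structures, DFT supplies accurate energy
        labels, and the DNN learns the structure-to-energy map to steer
        later generations (guide is an optional argument of propose here)."""
        population = propose(composition)
        for _ in range(generations):
            energies = [dft_energy(s) for s in population]   # accurate labels
            dnn.fit(population, energies)                    # learn E(structure)
            population = propose(composition, guide=dnn.predict)  # cheap screening
        return min(population, key=dnn.predict)              # most stable candidate
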
arXiv:2501.15574 [pdf, ps, other] cs.CL
Instruction Tuning for Story Understanding and Generation with Weak Supervision
Authors: Yangshu Yuan, Heng Chen, Christian Ng
Abstract: Story understanding and generation have long been challenging tasks in natural language processing (NLP), especially when dealing with various levels of instruction specificity. In this paper, we propose a novel approach called "Weak to Strong Instruction Tuning" for improving story generation by tuning models with instructions of varying clarity. We explore the potential of large language models (LLMs) to adapt to different types of instructions, weak and strong, and show that our method significantly enhances performance in story comprehension and generation. By leveraging the strength of instruction tuning, we train models to understand the nuances of story plots, characters, and themes while generating coherent and engaging narratives. Through extensive experiments on several benchmark datasets and comparison with state-of-the-art baselines, we demonstrate that our method outperforms existing techniques, yielding substantial improvements in both automatic evaluation metrics and human evaluations. Our work shows that adaptive instruction tuning can be a powerful tool in refining generative models for complex narrative tasks.
Submitted 26 January, 2025; originally announced January 2025.
arXiv:2501.15268 [pdf, other] cs.CL
New Evaluation Paradigm for Lexical Simplification
Authors: Jipeng Qiang, Minjiang Huang, Yi Zhu, Yunhao Yuan, Chaowei Zhang, Xiaoye Ouyang
Abstract: Lexical Simplification (LS) methods use a three-step pipeline: complex word identification, substitute generation, and substitute ranking, each with separate evaluation datasets. We found that large language models (LLMs) can simplify sentences directly with a single prompt, bypassing the traditional pipeline. However, existing LS datasets are not suitable for evaluating these LLM-generated simplified sentences, as they focus on providing substitutes for single complex words without identifying all complex words in a sentence. To address this gap, we propose a new annotation method for constructing an all-in-one LS dataset through human-machine collaboration. Automated methods generate a pool of potential substitutes, which human annotators then assess, suggesting additional alternatives as needed. Additionally, we explore LLM-based methods with single prompts, in-context learning, and chain-of-thought techniques. We introduce a multi-LLM collaboration approach to simulate each step of the LS task. Experimental results demonstrate that LS based on multi-LLM approaches significantly outperforms existing baselines.
Submitted 25 January, 2025; originally announced January 2025.
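The multi-LLM idea of simulating each pipeline step might look like the following, with identifier, generator, and ranker as hypothetical LLM-backed callables (one per pipeline step):

    def multi_llm_simplify(sentence, identifier, generator, ranker):
        # Mirror the classic LS pipeline with one LLM role per step:
        # identification -> substitute generation -> substitute ranking.
        complex_words = identifier(f"List the complex words in: {sentence}")
        out = sentence
        for word in complex_words:
            candidates = generator(
                f"Give simpler synonyms of '{word}' as used in: {out}")
            best = ranker(
                f"Pick the simplest synonym of '{word}' from {candidates} "
                f"that fits: {out}")
            out = out.replace(word, best)
        return out

By contrast, the single-prompt baseline the abstract mentions would collapse all three steps into one instruction to a single model.
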
arXiv:2501.13830 [pdf, other] math.OC cs.AI cs.LG
A space-decoupling framework for optimization on bounded-rank matrices with orthogonally invariant constraints
Authors: Yan Yang, Bin Gao, Ya-xiang Yuan
Abstract: Imposing additional constraints on low-rank optimization has garnered growing interest. However, the geometry of coupled constraints hampers the well-developed low-rank structure and makes the problem intricate. To this end, we propose a space-decoupling framework for optimization on bounded-rank matrices with orthogonally invariant constraints. The "space-decoupling" is reflected in several ways. We show that the tangent cone of coupled constraints is the intersection of tangent cones of each constraint. Moreover, we decouple the intertwined bounded-rank and orthogonally invariant constraints into two spaces, leading to optimization on a smooth manifold. Implementing Riemannian algorithms on this manifold is painless as long as the geometry of additional constraints is known. In addition, we unveil the equivalence between the reformulated problem and the original problem. Numerical experiments on real-world applications -- spherical data fitting, graph similarity measuring, low-rank SDP, model reduction of Markov processes, reinforcement learning, and deep learning -- validate the superiority of the proposed framework.
Submitted 23 January, 2025; originally announced January 2025.
Comments: 48 pages, 12 figures, 6 tables
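Stated symbolically, the tangent-cone fact quoted in the abstract reads as follows (the notation is mine: $\mathcal{M}_{\le r}$ is the bounded-rank set, $\mathcal{C}$ the orthogonally invariant constraint set, and $X$ a feasible point):

$$ T_{\mathcal{M}_{\le r} \cap \mathcal{C}}(X) \;=\; T_{\mathcal{M}_{\le r}}(X) \,\cap\, T_{\mathcal{C}}(X) $$

This is the property that lets the two constraints be handled in separate spaces rather than as one coupled geometry.
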
arXiv:2501.13794 [pdf, other] cs.LG
Unveiling the Power of Noise Priors: Enhancing Diffusion Models for Mobile Traffic Prediction
Authors: Zhi Sheng, Yuan Yuan, Jingtao Ding, Yong Li
Abstract: Accurate prediction of mobile traffic, i.e., network traffic from cellular base stations, is crucial for optimizing network performance and supporting urban development. However, the non-stationary nature of mobile traffic, driven by human activity and environmental changes, leads to both regular patterns and abrupt variations. Diffusion models excel in capturing such complex temporal dynamics due to their ability to capture the inherent uncertainties. Most existing approaches prioritize designing novel denoising networks but often neglect the critical role of noise itself, potentially leading to sub-optimal performance. In this paper, we introduce a novel perspective by emphasizing the role of noise in the denoising process. Our analysis reveals that noise fundamentally shapes mobile traffic predictions, exhibiting distinct and consistent patterns. We propose NPDiff, a framework that decomposes noise into prior and residual components, with the prior derived from data dynamics, enhancing the model's ability to capture both regular and abrupt variations. NPDiff can seamlessly integrate with various diffusion-based prediction models, delivering predictions that are effective, efficient, and robust. Extensive experiments demonstrate that it achieves superior performance with an improvement of over 30%, offering a new perspective on leveraging diffusion models in this domain.
Submitted 23 January, 2025; originally announced January 2025.
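One plausible reading of the prior/residual split, with the prior estimated from a periodic component of the data (the period, normalization, and names are assumptions, not the paper's definitions):

    import numpy as np

    def noise_prior(history, horizon, period=144):
        # Estimate the deterministic part of the noise from regular data
        # dynamics, e.g. the daily cycle of mobile traffic (period is the
        # assumed number of time steps per day).
        last_cycle = history[-period:]
        prior = np.resize(last_cycle - last_cycle.mean(), horizon)
        return prior / (last_cycle.std() + 1e-8)

    def split_noise(eps, history):
        # NPDiff-style decomposition: a prior derived from data dynamics plus
        # a residual that the denoising network is left to model.
        prior = noise_prior(history, horizon=len(eps))
        return prior, eps - prior
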
arXiv:2501.13106 [pdf, other] cs.CV
VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding
Authors: Boqiang Zhang, Kehan Li, Zesen Cheng, Zhiqiang Hu, Yuqian Yuan, Guanzheng Chen, Sicong Leng, Yuming Jiang, Hang Zhang, Xin Li, Peng Jin, Wenqi Zhang, Fan Wang, Lidong Bing, Deli Zhao
Abstract: In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation model for image and video understanding. The core design philosophy of VideoLLaMA3 is vision-centric, in two senses: a vision-centric training paradigm and a vision-centric framework design. The key insight of our vision-centric training paradigm is that high-quality image-text data is crucial for both image and video understanding. Instead of preparing massive video-text datasets, we focus on constructing large-scale and high-quality image-text datasets. VideoLLaMA3 has four training stages: (1) Vision Encoder Adaptation, which enables the vision encoder to accept images of variable resolutions as input; (2) Vision-Language Alignment, which jointly tunes the vision encoder, projector, and LLM with large-scale image-text data covering multiple types (including scene images, documents, and charts) as well as text-only data; (3) Multi-task Fine-tuning, which incorporates image-text SFT data for downstream tasks and video-text data to establish a foundation for video understanding; (4) Video-centric Fine-tuning, which further improves the model's capability in video understanding. As for the framework design, to better capture fine-grained details in images, the pretrained vision encoder is adapted to encode images of varying sizes into a corresponding number of vision tokens, rather than a fixed number of tokens. For video inputs, we reduce the number of vision tokens according to their similarity so that the representation of videos is more precise and compact. Benefiting from these vision-centric designs, VideoLLaMA3 achieves compelling performance on both image and video understanding benchmarks.
Submitted 28 January, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
Comments: BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3
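The similarity-based token reduction could work roughly like the greedy merge below (a guess at the mechanism from the abstract's description; the model's exact rule may differ):

    import torch
    import torch.nn.functional as F

    def reduce_video_tokens(tokens, threshold=0.9):
        # tokens: (num_tokens, dim). Drop a token when it is nearly identical
        # to the previously kept one, so static video content costs fewer
        # vision tokens while changes are preserved.
        kept = [tokens[0]]
        for t in tokens[1:]:
            if F.cosine_similarity(t, kept[-1], dim=0) < threshold:
                kept.append(t)
        return torch.stack(kept)
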
arXiv:2501.11222 [pdf, other] cs.LG stat.ML
An Imbalanced Learning-based Sampling Method for Physics-informed Neural Networks
Authors: Jiaqi Luo, Yahong Yang, Yuan Yuan, Shixin Xu, Wenrui Hao
Abstract: This paper introduces Residual-based Smote (RSmote), an innovative local adaptive sampling technique tailored to improve the performance of Physics-Informed Neural Networks (PINNs) through imbalanced learning strategies. Traditional residual-based adaptive sampling methods, while effective in enhancing PINN accuracy, often struggle with efficiency and high memory consumption, particularly in high-dimensional problems. RSmote addresses these challenges by targeting regions with high residuals and employing oversampling techniques from imbalanced learning to refine the sampling process. Our approach is underpinned by a rigorous theoretical analysis that supports the effectiveness of RSmote in managing computational resources more efficiently. Through extensive evaluations, we benchmark RSmote against the state-of-the-art Residual-based Adaptive Distribution (RAD) method across a variety of dimensions and differential equations. The results demonstrate that RSmote not only achieves or exceeds the accuracy of RAD but also significantly reduces memory usage, making it particularly advantageous in high-dimensional scenarios. These contributions position RSmote as a robust and resource-efficient solution for solving complex partial differential equations, especially when computational constraints are a critical consideration.
Submitted 19 January, 2025; originally announced January 2025.
Comments: 11 figures, 7 tables
MSC Class: G.1.8
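The core move, treating high-residual collocation points as a minority class and synthesizing new points between them SMOTE-style, fits in a few lines; the threshold fraction and interpolation rule below are assumptions, not the paper's settings:

    import numpy as np

    def rsmote_resample(points, residuals, top_frac=0.1, n_new=1000, seed=None):
        # points: (n, dim) candidate collocation points; residuals: (n,)
        # PINN residual magnitudes at those points.
        rng = np.random.default_rng(seed)
        k = max(1, int(top_frac * len(points)))
        hard = points[np.argsort(residuals)[-k:]]   # the "minority class"
        # SMOTE-style synthesis: interpolate between random pairs of hard
        # points to densify sampling where the PDE is poorly satisfied.
        i = rng.integers(0, k, size=n_new)
        j = rng.integers(0, k, size=n_new)
        lam = rng.random((n_new, 1))
        return hard[i] + lam * (hard[j] - hard[i])
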
arXiv:2501.10699 [pdf, other] cs.CR
VENENA: A Deceptive Visual Encryption Framework for Wireless Semantic Secrecy
Authors: Bin Han, Ye Yuan, Hans D. Schotten
Abstract: Eavesdropping has been a long-standing threat to the security and privacy of wireless communications, since it is difficult to detect and costly to prevent. As networks evolve towards the Sixth Generation (6G) and semantic communication becomes increasingly central to next-generation wireless systems, securing semantic information transmission emerges as a critical challenge. While classical physical layer security (PLS) focuses on passive security, the recently proposed concept of physical layer deception (PLD) offers a semantic encryption measure to actively deceive eavesdroppers. Yet existing studies of PLD have been predominantly information-theoretical and link-level oriented, lacking considerations of system-level design and practical implementation. In this work we propose a novel artificial intelligence (AI)-enabled framework called Visual ENcryption for Eavesdropping NegAtion (VENENA), which combines the techniques of PLD, visual encryption, and image poisoning into a comprehensive mechanism for deceptive secure semantic transmission in future wireless networks. By leveraging advanced vision transformers and semantic codecs, VENENA demonstrates how semantic security can be enhanced through the synergy of physical layer techniques and artificial intelligence, paving the way for secure semantic communication in 6G networks.
Submitted 18 January, 2025; originally announced January 2025.
Comments: Submitted to IEEE WCM

arXiv:2501.10182 [pdf, other] cs.CR eess.SP
Secure Semantic Communication With Homomorphic Encryption
Authors: Rui Meng, Dayu Fan, Haixiao Gao, Yifan Yuan, Bizhu Wang, Xiaodong Xu, Mengying Sun, Chen Dong, Xiaofeng Tao, Ping Zhang, Dusit Niyato
Abstract: In recent years, Semantic Communication (SemCom), which aims to achieve efficient and reliable transmission of meaning between agents, has garnered significant attention from both academia and industry. To ensure the security of communication systems, encryption techniques are employed to safeguard confidentiality and integrity. However, traditional cryptography-based encryption algorithms encounter obstacles when applied to SemCom. Motivated by this, this paper explores the feasibility of applying homomorphic encryption to SemCom. Initially, we review the encryption algorithms utilized in mobile communication systems and analyze the challenges associated with their application to SemCom. Subsequently, we employ the scale-invariant feature transform to demonstrate that semantic features can be preserved in homomorphically encrypted ciphertext. Based on this finding, we propose a task-oriented SemCom scheme secured through homomorphic encryption. We design a privacy-preserving deep joint source-channel coding (JSCC) encoder and decoder, and the frequency of key updates can be adjusted according to service requirements without compromising transmission performance. Simulation results validate that, compared to plaintext images, the proposed scheme achieves almost the same classification accuracy when dealing with homomorphic ciphertext images. Furthermore, we provide potential future research directions for homomorphic encrypted SemCom.
Submitted 17 January, 2025; originally announced January 2025.
Comments: 8 pages, 3 figures

arXiv:2501.06425 [pdf, other] cs.CL cs.AI cs.LG
Tensor Product Attention Is All You Need
Authors: Yifan Zhang, Yifeng Liu, Huizhuo Yuan, Zhen Qin, Yang Yuan, Quanquan Gu, Andrew Chi-Chih Yao
Abstract: Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking the KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. Through extensive empirical evaluation on language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPA's memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6.
Submitted 7 February, 2025; v1 submitted 10 January, 2025; originally announced January 2025.
Comments: 31 pages, 6 figures
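In spirit, each token produces small contextual low-rank factors whose product replaces the full per-head query/key/value vectors, so a cache can hold the factors instead of full K/V. A toy version for one projection (shapes, rank, and naming are illustrative, and RoPE integration is omitted; see the linked repository for the actual architecture):

    import torch
    import torch.nn as nn

    class LowRankContextualProjection(nn.Module):
        # Emits per-token factors A(x): (heads, rank) and B(x): (rank,
        # head_dim); their product stands in for the usual full head vectors.
        def __init__(self, dim, heads, head_dim, rank):
            super().__init__()
            self.heads, self.head_dim, self.rank = heads, head_dim, rank
            self.to_a = nn.Linear(dim, heads * rank)
            self.to_b = nn.Linear(dim, rank * head_dim)

        def forward(self, x):                      # x: (batch, seq, dim)
            b, s, _ = x.shape
            A = self.to_a(x).view(b, s, self.heads, self.rank)
            B = self.to_b(x).view(b, s, self.rank, self.head_dim)
            # (batch, seq, heads, head_dim), assembled from the small factors
            return torch.einsum('bshr,bsrd->bshd', A, B)

Storing A and B for keys and values instead of the assembled tensors is what shrinks the cache when rank is small.
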
By fact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06425v2-abstract-full').style.display = 'inline'; document.getElementById('2501.06425v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06425v2-abstract-full" style="display: none;"> Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. Through extensive empirical evaluation of language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPA's memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06425v2-abstract-full').style.display = 'none'; document.getElementById('2501.06425v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05031">arXiv:2501.05031</a> <span> [<a href="https://arxiv.org/pdf/2501.05031">pdf</a>, <a href="https://arxiv.org/format/2501.05031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ECBench: Can Multi-modal Foundation Models Understand the Egocentric World? 
arXiv:2501.05031 [pdf, other] cs.CV cs.LG cs.RO (https://arxiv.org/abs/2501.05031)
ECBench: Can Multi-modal Foundation Models Understand the Egocentric World? A Holistic Embodied Cognition Benchmark
Authors: Ronghao Dang, Yuqian Yuan, Wenqi Zhang, Yifei Xin, Boqiang Zhang, Long Li, Liuyi Wang, Qinyang Zeng, Xin Li, Lidong Bing
Abstract: The enhancement of generalization in robots by large vision-language models (LVLMs) is increasingly evident. Therefore, the embodied cognitive abilities of LVLMs based on egocentric videos are of great interest. However, current datasets for embodied video question answering lack comprehensive and systematic evaluation frameworks. Critical embodied cognitive issues, such as robotic self-cognition, dynamic scene perception, and hallucination, are rarely addressed. To tackle these challenges, we propose ECBench, a high-quality benchmark designed to systematically evaluate the embodied cognitive abilities of LVLMs. ECBench features a diverse range of scene video sources, open and varied question formats, and 30 dimensions of embodied cognition. To ensure quality, balance, and high visual dependence, ECBench uses class-independent meticulous human annotation and multi-round question screening strategies. Additionally, we introduce ECEval, a comprehensive evaluation system that ensures the fairness and rationality of the indicators. Utilizing ECBench, we conduct extensive evaluations of proprietary, open-source, and task-specific LVLMs. ECBench is pivotal in advancing the embodied cognitive capabilities of LVLMs, laying a solid foundation for developing reliable core models for embodied agents. All data and code are available at https://github.com/Rh-Dang/ECBench.
Submitted 9 January, 2025; originally announced January 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05031v1-abstract-full').style.display = 'none'; document.getElementById('2501.05031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03857">arXiv:2501.03857</a> <span> [<a href="https://arxiv.org/pdf/2501.03857">pdf</a>, <a href="https://arxiv.org/format/2501.03857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Progressive Document-level Text Simplification via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+D">Dengzhao Fang</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+J">Jipeng Qiang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yunhao Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03857v1-abstract-short" style="display: inline;"> Research on text simplification has primarily focused on lexical and sentence-level changes. Long document-level simplification (DS) is still relatively unexplored. Large Language Models (LLMs), like ChatGPT, have excelled in many natural language processing tasks. However, their performance on DS tasks is unsatisfactory, as they often treat DS as merely document summarization. For the DS task, th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03857v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03857v1-abstract-full" style="display: none;"> Research on text simplification has primarily focused on lexical and sentence-level changes. Long document-level simplification (DS) is still relatively unexplored. Large Language Models (LLMs), like ChatGPT, have excelled in many natural language processing tasks. However, their performance on DS tasks is unsatisfactory, as they often treat DS as merely document summarization. For the DS task, the generated long sequences not only must maintain consistency with the original document throughout, but complete moderate simplification operations encompassing discourses, sentences, and word-level simplifications. Human editors employ a hierarchical complexity simplification strategy to simplify documents. This study delves into simulating this strategy through the utilization of a multi-stage collaboration using LLMs. We propose a progressive simplification method (ProgDS) by hierarchically decomposing the task, including the discourse-level, topic-level, and lexical-level simplification. 
arXiv:2501.03605 [pdf, other] cs.CV cs.MM eess.IV (https://arxiv.org/abs/2501.03605)
ConcealGS: Concealing Invisible Copyright Information in 3D Gaussian Splatting
Authors: Yifeng Yang, Hengyu Liu, Chenxin Li, Yining Sun, Wuyang Li, Yifan Liu, Yiyang Lin, Yixuan Yuan, Nanyang Ye
Abstract: With the rapid development of 3D reconstruction technology, the widespread distribution of 3D data has become a future trend. While traditional visual data (such as images and videos) and NeRF-based formats already have mature techniques for copyright protection, steganographic techniques for the emerging 3D Gaussian Splatting (3D-GS) format have yet to be fully explored. To address this, we propose ConcealGS, an innovative method for embedding implicit information into 3D-GS. By introducing a knowledge-distillation and gradient-optimization strategy based on 3D-GS, ConcealGS overcomes the limitations of NeRF-based models, enhancing both the robustness of the implicit information and the quality of 3D reconstruction. We evaluate ConcealGS in various potential application scenarios, and experimental results demonstrate that ConcealGS not only successfully recovers the implicit information but also has almost no impact on rendering quality, providing a new approach for embedding invisible and recoverable information into 3D models in the future.
Submitted 7 January, 2025; originally announced January 2025.
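Steganographic embedding of this kind is typically trained as a joint objective: render quality, fidelity to an unmodified reference, and recoverability of the hidden bits. The sketch below is a hedged illustration of such a composite loss; every callable and weight is a placeholder, and the actual ConcealGS objective and architecture are defined in the paper.

```python
import torch
import torch.nn.functional as F

# Composite objective: keep renders close to ground truth, distill toward
# the frozen unmodified model's renders, and keep the hidden bits
# recoverable from the render. All callables and weights are placeholders.
def concealment_loss(render_student, render_teacher, decode_bits,
                     gt_image, bits, w_msg=1.0, w_distill=0.1):
    img = render_student()                          # render from modified 3D-GS
    loss_render = F.l1_loss(img, gt_image)          # reconstruction quality
    with torch.no_grad():
        ref = render_teacher()                      # frozen reference render
    loss_distill = F.mse_loss(img, ref)             # stay close to the original
    logits = decode_bits(img)                       # recover the embedded message
    loss_msg = F.binary_cross_entropy_with_logits(logits, bits)
    return loss_render + w_distill * loss_distill + w_msg * loss_msg
```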
arXiv:2501.02741 [pdf, other] cs.CV (https://arxiv.org/abs/2501.02741)
Brick-Diffusion: Generating Long Videos with Brick-to-Wall Denoising
Authors: Yunlong Yuan, Yuanfan Guo, Chunwei Wang, Hang Xu, Li Zhang
Abstract: Recent advances in diffusion models have greatly improved text-driven video generation. However, training models for long video generation demands significant computational power and extensive data, leading most video diffusion models to be limited to a small number of frames. Existing training-free methods that attempt to generate long videos using pre-trained short video diffusion models often struggle with issues such as insufficient motion dynamics and degraded video fidelity. In this paper, we present Brick-Diffusion, a novel, training-free approach capable of generating long videos of arbitrary length. Our method introduces a brick-to-wall denoising strategy, where the latent is denoised in segments, with a stride applied in subsequent iterations. This process mimics the construction of a staggered brick wall, where each brick represents a denoised segment, enabling communication between frames and improving overall video quality. Through quantitative and qualitative evaluations, we demonstrate that Brick-Diffusion outperforms existing baseline methods in generating high-fidelity videos.
Submitted 5 January, 2025; originally announced January 2025.
Comments: ICASSP 2025
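The staggering idea is easy to see in code. Below is a toy sketch of segment-wise denoising with an alternating offset; the segment length, stride, and the `denoise_segment` stand-in are assumptions, and the paper's actual scheduler and model calls are not shown in the abstract.

```python
import numpy as np

# Toy brick-to-wall denoising: the latent is processed in fixed-length
# segments, and the segment grid is shifted by `stride` on alternating
# steps so consecutive steps overlap like staggered bricks.
def brick_to_wall_denoise(latent, steps, seg_len=16, stride=8,
                          denoise_segment=lambda seg, t: 0.9 * seg):
    T = latent.shape[0]                      # number of frames
    for t in range(steps):
        offset = (t % 2) * stride            # shift every other step
        for start in range(-offset, T, seg_len):
            s, e = max(start, 0), min(start + seg_len, T)
            if s < e:                        # one "brick"
                latent[s:e] = denoise_segment(latent[s:e], t)
    return latent

x = np.random.randn(40, 4, 8, 8)             # (frames, C, H, W) toy latent
brick_to_wall_denoise(x, steps=4)
```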
arXiv:2501.01845 [pdf, other] cs.CV (https://arxiv.org/abs/2501.01845)
Semantic Segmentation for Sequential Historical Maps by Learning from Only One Map
Authors: Yunshuang Yuan, Frank Thiemann, Monika Sester
Abstract: Historical maps are valuable resources that capture detailed geographical information from the past. However, these maps are typically available in printed formats, which are not conducive to modern computer-based analyses. Digitizing these maps into a machine-readable format enables efficient computational analysis. In this paper, we propose an automated approach to digitization using deep-learning-based semantic segmentation, which assigns a semantic label to each pixel in scanned historical maps. A key challenge in this process is the lack of ground-truth annotations required for training deep neural networks, as manual labeling is time-consuming and labor-intensive. To address this issue, we introduce a weakly-supervised age-tracing strategy for model fine-tuning. This approach exploits the similarity in appearance and land-use patterns between historical maps from neighboring time periods to guide the training process. Specifically, model predictions for one map are utilized as pseudo-labels for training on maps from adjacent time periods. Experiments conducted on our newly curated Hameln dataset demonstrate that the proposed age-tracing strategy significantly enhances segmentation performance compared to baseline models. In the best-case scenario, the mean Intersection over Union (mIoU) reached 77.3%, an improvement of approximately 20% over baseline methods. Additionally, the fine-tuned model achieved an average overall accuracy of 97%, highlighting the effectiveness of our approach for digitizing historical maps.
Submitted 3 January, 2025; originally announced January 2025.
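The age-tracing loop itself is simple to express. Here is a hedged sketch under assumed interfaces; `maps_by_year`, `train_on`, and `predict` are placeholders, not the authors' code, and the actual fine-tuning details are in the paper.

```python
# Assumed interfaces: maps_by_year[year] -> (image, labels_or_None),
# train_on(model, image, labels), predict(model, image) -> label map.
def age_tracing_finetune(model, maps_by_year, labeled_year, train_on, predict):
    image, labels = maps_by_year[labeled_year]
    train_on(model, image, labels)              # supervised stage on the one labeled map
    for step in (+1, -1):                       # trace forward, then backward in time
        year = labeled_year + step
        while year in maps_by_year:
            image, _ = maps_by_year[year]
            pseudo = predict(model, image)      # current model labels the neighboring map
            train_on(model, image, pseudo)      # fine-tune on the pseudo-labels
            year += step
    return model
```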
arXiv:2501.00742 [pdf, other] cs.LG cs.AR physics.optics (https://arxiv.org/abs/2501.00742)
Experimental Demonstration of an Optical Neural PDE Solver via On-Chip PINN Training
Authors: Yequan Zhao, Xian Xiao, Antoine Descos, Yuan Yuan, Xinling Yu, Geza Kurczveil, Marco Fiorentino, Zheng Zhang, Raymond G. Beausoleil
Abstract: Partial differential equations (PDEs) are an important mathematical tool in science and engineering. This paper experimentally demonstrates an optical neural PDE solver by leveraging back-propagation-free, on-photonic-chip training of physics-informed neural networks.
Submitted 1 January, 2025; originally announced January 2025.
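For context, a physics-informed neural network (PINN) fits a network to a PDE by penalizing the equation residual and the boundary conditions. The sketch below is the conventional autograd formulation for u''(x) = -sin(x) on [0, pi] with zero boundary values (exact solution u = sin x); the paper's contribution, training such a network without back-propagation on a photonic chip, is not reproduced here.

```python
import torch

# PINN for u''(x) = -sin(x) on [0, pi], u(0) = u(pi) = 0; exact u = sin(x).
net = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(),
                          torch.nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
boundary = torch.tensor([[0.0], [torch.pi]])

for step in range(2000):
    x = (torch.rand(64, 1) * torch.pi).requires_grad_(True)  # collocation points
    u = net(x)
    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
    d2u = torch.autograd.grad(du.sum(), x, create_graph=True)[0]
    # PDE residual u'' + sin(x) plus boundary penalty u(0)^2, u(pi)^2.
    loss = ((d2u + torch.sin(x)) ** 2).mean() + (net(boundary) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()

print(float(net(torch.tensor([[torch.pi / 2]]))))  # should approach sin(pi/2) = 1
```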
arXiv:2501.00599 [pdf, other] cs.CV cs.AI cs.LG (https://arxiv.org/abs/2501.00599)
VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM
Authors: Yuqian Yuan, Hang Zhang, Wentong Li, Zesen Cheng, Boqiang Zhang, Long Li, Xin Li, Deli Zhao, Wenqiao Zhang, Yueting Zhuang, Jianke Zhu, Lidong Bing
Abstract: Video Large Language Models (Video LLMs) have recently exhibited remarkable capabilities in general video understanding. However, they mainly focus on holistic comprehension and struggle with capturing fine-grained spatial and temporal details. Besides, the lack of high-quality object-level video instruction data and of a comprehensive benchmark further hinders their advancement. To tackle these challenges, we introduce the VideoRefer Suite to empower Video LLMs with finer-level spatial-temporal video understanding, i.e., enabling perception and reasoning on any object throughout the video. Specifically, we develop the VideoRefer Suite across three essential aspects: dataset, model, and benchmark. First, we introduce a multi-agent data engine to meticulously curate a large-scale, high-quality object-level video instruction dataset, termed VideoRefer-700K. Next, we present the VideoRefer model, which is equipped with a versatile spatial-temporal object encoder that captures precise regional and sequential representations. Finally, we meticulously create VideoRefer-Bench to comprehensively assess the spatial-temporal understanding capability of a Video LLM, evaluating it across various aspects. Extensive experiments and analyses demonstrate that our VideoRefer model not only achieves promising performance on video referring benchmarks but also facilitates general video understanding capabilities.
Submitted 8 January, 2025; v1 submitted 31 December, 2024; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 14 figures, technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18551">arXiv:2412.18551</a> <span> [<a href="https://arxiv.org/pdf/2412.18551">pdf</a>, <a href="https://arxiv.org/format/2412.18551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Libra-Leaderboard: Towards Responsible AI through a Balanced Leaderboard of Safety and Capability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haonan Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xudong Han</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Z">Zenan Zhai</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+H">Honglin Mu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yilin Geng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shom Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Renxi Wang</a>, <a href="/search/cs?searchtype=author&query=Shelmanov%2C+A">Artem Shelmanov</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiangyu Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxia Wang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+D">Donghai Hong</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Youliang Yuan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meng Chen</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+H">Haoqin Tu</a>, <a href="/search/cs?searchtype=author&query=Koto%2C+F">Fajri Koto</a>, <a href="/search/cs?searchtype=author&query=Kuribayashi%2C+T">Tatsuki Kuribayashi</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+C">Cong Zeng</a>, <a href="/search/cs?searchtype=author&query=Bhardwaj%2C+R">Rishabh Bhardwaj</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+B">Bingchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yawen Duan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Alghamdi%2C+E+A">Emad A. Alghamdi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a> , et al. (10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18551v1-abstract-short" style="display: inline;"> To address this gap, we introduce Libra-Leaderboard, a comprehensive framework designed to rank LLMs through a balanced evaluation of performance and safety. Combining a dynamic leaderboard with an interactive LLM arena, Libra-Leaderboard encourages the joint optimization of capability and safety. 
arXiv:2412.18443 [pdf, other] cs.CL (https://arxiv.org/abs/2412.18443)
Is Large Language Model Good at Triple Set Prediction? An Empirical Study
Authors: Yuan Yuan, Yajing Xu, Wen Zhang
Abstract: The core of the Knowledge Graph Completion (KGC) task is to predict and complete the missing relations or nodes in a KG. Common KGC tasks are mostly about inferring unknown elements from triples in which one or two elements are already known. In comparison, the Triple Set Prediction (TSP) task is a more realistic knowledge graph completion task: it aims to predict all elements of unknown triples based on the information from known triples. In recent years, large language models (LLMs) have exhibited significant advancements in language comprehension, demonstrating considerable potential for KGC tasks. However, the potential of LLMs on the TSP task has yet to be investigated. Thus, in this paper, we propose a new framework to explore the strengths and limitations of LLMs in the TSP task. Specifically, the framework consists of LLM-based rule mining and LLM-based triple set prediction. The KG's relation list, which carries rich semantic information, is first leveraged to prompt the LLM to generate rules. This process is both efficient and independent of statistical information, making it easier to mine effective and realistic rules. For each subgraph, the specified rules are applied in conjunction with the relevant triples within that subgraph to guide the LLM in predicting the missing triples. The predictions from all subgraphs are then consolidated to derive the complete set of predicted triples on the KG. Finally, the method is evaluated on the relatively complete CFamily dataset. The experimental results indicate that when LLMs are required to adhere to a large amount of factual knowledge to predict missing triples, significant hallucinations occur, leading to a noticeable decline in performance. To further explore the causes of this phenomenon, this paper presents a comprehensive analysis supported by a detailed case study.
Submitted 24 December, 2024; originally announced December 2024.
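The two LLM stages the abstract names, rule mining over the relation list and per-subgraph triple prediction, can be sketched as follows. This is a hypothetical outline: `call_llm` and all prompt wording are invented, and the paper's actual prompts, rule format, and consolidation step are not shown.

```python
# Hypothetical outline of the two LLM stages; `call_llm` is a placeholder.
def call_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def mine_rules(relations: list[str]) -> list[str]:
    prompt = ("Knowledge-graph relations: " + ", ".join(relations) +
              "\nPropose plausible logical rules of the form "
              "r1(x, y) & r2(y, z) -> r3(x, z), one per line.")
    return call_llm(prompt).splitlines()

def predict_missing_triples(subgraph: list[tuple], rules: list[str]) -> list[str]:
    prompt = ("Known triples:\n" + "\n".join(map(str, subgraph)) +
              "\nRules:\n" + "\n".join(rules) +
              "\nList the missing (head, relation, tail) triples implied "
              "above, one per line.")
    return call_llm(prompt).splitlines()

# Predictions from all subgraphs would then be consolidated into one set.
```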
arXiv:2412.18419 [pdf] cs.AI (https://arxiv.org/abs/2412.18419)
Research on the Proximity Relationships of Psychosomatic Disease Knowledge Graph Modules Extracted by Large Language Models
Authors: Zihan Zhou, Ziyi Zeng, Wenhao Jiang, Yihui Zhu, Jiaxin Mao, Yonggui Yuan, Min Xia, Shubin Zhao, Mengyu Yao, Yunqian Chen
Abstract: As social changes accelerate, the incidence of psychosomatic disorders has significantly increased, becoming a major challenge in global health. This necessitates an innovative knowledge system and analytical methods to aid diagnosis and treatment. Here, we establish the ontology model and entity types, using the BERT model and a LoRA-tuned LLM for named entity recognition, and construct a knowledge graph with 9668 triples. Next, by analyzing the network distances between disease, symptom, and drug modules, we find that closer network distances among diseases predict greater similarities in their clinical manifestations, treatment approaches, and psychological mechanisms, and that closer distances between symptoms indicate they are more likely to co-occur. Lastly, by comparing the proximity d and the proximity z-score, we show that symptom-disease pairs in primary diagnostic relationships have a stronger association and are of higher referential value than those in diagnostic relationships. The results reveal potential connections between diseases, co-occurring symptoms, and similarities in treatment strategies, providing new perspectives for the diagnosis and treatment of psychosomatic disorders and valuable information for future mental health research and practice.
Submitted 24 December, 2024; originally announced December 2024.
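Module-to-module network distance and its significance can be computed with standard graph tooling. Below is a simplified illustration using networkx, with uniformly sampled (not degree-preserving) null modules; the paper's exact proximity definitions may differ.

```python
import random
import networkx as nx

# Observed module distance d: for each node in A, the shortest-path
# distance to the nearest node in B, averaged over A. The z-score
# compares d against random modules of the same sizes.
def mean_distance(G, A, B):
    return sum(min(nx.shortest_path_length(G, a, b) for b in B) for a in A) / len(A)

def proximity(G, A, B, n_rand=200, seed=0):
    rng = random.Random(seed)
    d_obs = mean_distance(G, A, B)
    nodes = list(G)
    rand = [mean_distance(G, rng.sample(nodes, len(A)), rng.sample(nodes, len(B)))
            for _ in range(n_rand)]
    mu = sum(rand) / n_rand
    sd = (sum((r - mu) ** 2 for r in rand) / n_rand) ** 0.5
    return d_obs, (d_obs - mu) / sd

G = nx.karate_club_graph()                     # toy stand-in for the disease KG
d, z = proximity(G, {0, 1, 2}, {30, 32, 33})
print(f"d = {d:.2f}, z = {z:.2f}")             # negative z: closer than chance
```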