Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 867 results for author: <span class="mathjax">Shen, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shen%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Shen, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shen%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shen, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shen%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shen%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shen%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shen%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Shen%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Shen%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08869">arXiv:2502.08869</a> <span> [<a href="https://arxiv.org/pdf/2502.08869">pdf</a>, <a href="https://arxiv.org/format/2502.08869">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Vision Models for Time Series Analysis: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jingchao Ni</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ziming Zhao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">ChengAo Shen</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dongjin Song</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Dongsheng Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08869v1-abstract-short" style="display: inline;"> Time series analysis has witnessed the inspiring development from traditional autoregressive models, deep learning models, to recent Transformers and Large Language Models (LLMs). 
   Efforts to leverage vision models for time series analysis have also been made along the way, but they are less visible to the community due to the predominant research on sequence modeling in this domain. However, the discrepancy between continuous time series and the discrete token space of LLMs, and the challenges of explicitly modeling the correlations of variates in multivariate time series, have shifted some research attention to the equally successful Large Vision Models (LVMs) and Vision Language Models (VLMs). To fill this gap in the existing literature, this survey discusses the advantages of vision models over LLMs in time series analysis. It provides a comprehensive and in-depth overview of the existing methods, with dual views of a detailed taxonomy answering the key research questions, including how to encode time series as images and how to model the imaged time series for various tasks. Additionally, we address the challenges in the pre- and post-processing steps involved in this framework and outline future directions to further advance time series analysis with vision models.
   Submitted 12 February, 2025; originally announced February 2025.
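
   To make the "encode time series as images" question concrete, below is a minimal sketch of one standard encoding, the Gramian Angular Summation Field (GASF). The encoding choice and helper name are illustrative; the survey's taxonomy covers this family of methods but is not limited to this one.

      import numpy as np

      def gasf_image(x: np.ndarray) -> np.ndarray:
          """Encode a 1-D series of length n as an n-by-n GASF image."""
          # Rescale the series to [-1, 1] so arccos is well defined.
          x_scaled = 2 * (x - x.min()) / (x.max() - x.min()) - 1
          phi = np.arccos(np.clip(x_scaled, -1.0, 1.0))  # angular form
          # GASF[i, j] = cos(phi_i + phi_j): a Gram-like matrix of angles.
          return np.cos(phi[:, None] + phi[None, :])

      series = np.sin(np.linspace(0, 4 * np.pi, 64))
      img = gasf_image(series)
      print(img.shape)  # (64, 64) -- ready for an ordinary vision model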
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07557">arXiv:2502.07557</a> <span> [<a href="https://arxiv.org/pdf/2502.07557">pdf</a>, <a href="https://arxiv.org/format/2502.07557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> JBShield: Defending Large Language Models from Jailbreak Attacks through Activated Concept Analysis and Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shenyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yuchen Zhai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+K">Keyan Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongxin Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lingchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07557v1-abstract-short" style="display: inline;"> Despite the implementation of safety alignment strategies, large language models (LLMs) remain vulnerable to jailbreak attacks, which undermine these safety guardrails and pose significant security threats. Some defenses have been proposed to detect or mitigate jailbreaks, but they are unable to withstand the test of time due to an insufficient understanding of jailbreak mechanisms. In this work,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07557v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07557v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07557v1-abstract-full" style="display: none;"> Despite the implementation of safety alignment strategies, large language models (LLMs) remain vulnerable to jailbreak attacks, which undermine these safety guardrails and pose significant security threats. Some defenses have been proposed to detect or mitigate jailbreaks, but they are unable to withstand the test of time due to an insufficient understanding of jailbreak mechanisms. In this work, we investigate the mechanisms behind jailbreaks based on the Linear Representation Hypothesis (LRH), which states that neural networks encode high-level concepts as subspaces in their hidden representations. We define the toxic semantics in harmful and jailbreak prompts as toxic concepts and describe the semantics in jailbreak prompts that manipulate LLMs to comply with unsafe requests as jailbreak concepts. Through concept extraction and analysis, we reveal that LLMs can recognize the toxic concepts in both harmful and jailbreak prompts. However, unlike harmful prompts, jailbreak prompts activate the jailbreak concepts and alter the LLM output from rejection to compliance. 
   Building on our analysis, we propose a comprehensive jailbreak defense framework, JBShield, consisting of two key components: jailbreak detection (JBShield-D) and mitigation (JBShield-M). JBShield-D identifies jailbreak prompts by determining whether the input activates both toxic and jailbreak concepts. When a jailbreak prompt is detected, JBShield-M adjusts the hidden representations of the target LLM by enhancing the toxic concept and weakening the jailbreak concept, ensuring that the LLM produces safe content. Extensive experiments demonstrate the superior performance of JBShield, which achieves an average detection accuracy of 0.95 and reduces the average attack success rate of various jailbreak attacks from 61% to 2% across distinct LLMs.
   Submitted 11 February, 2025; originally announced February 2025.
   Comments: To appear in the 34th USENIX Security Symposium, August 13-15, 2025.
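
   The detect-then-steer mechanism the abstract describes can be illustrated in a few lines. The sketch below assumes concept directions have already been extracted as vectors in the model's hidden space; the thresholds, scaling factors, and function names are hypothetical stand-ins, not the paper's implementation.

      import numpy as np

      def cos(a, b):
          return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

      def detect(h, toxic_dir, jb_dir, tau_t=0.3, tau_j=0.3):
          # JBShield-D-style criterion: flag only when BOTH concepts activate.
          return cos(h, toxic_dir) > tau_t and cos(h, jb_dir) > tau_j

      def mitigate(h, toxic_dir, jb_dir, alpha=1.0, beta=1.0):
          # JBShield-M-style steering: strengthen the toxic concept (so the
          # model recognizes harm) and weaken the jailbreak concept.
          t = toxic_dir / np.linalg.norm(toxic_dir)
          j = jb_dir / np.linalg.norm(jb_dir)
          return h + alpha * t - beta * j

      rng = np.random.default_rng(0)
      toxic_dir, jb_dir = rng.normal(size=64), rng.normal(size=64)
      h = 0.5 * toxic_dir + 0.5 * jb_dir + 0.1 * rng.normal(size=64)
      if detect(h, toxic_dir, jb_dir):     # both concepts active -> jailbreak
          h = mitigate(h, toxic_dir, jb_dir)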

3. arXiv:2502.05855 [pdf, other] — cs.RO, cs.CV
   DexVLA: Vision-Language Model with Plug-In Diffusion Expert for General Robot Control
   Authors: Junjie Wen, Yichen Zhu, Jinming Li, Zhibin Tang, Chaomin Shen, Feifei Feng
   Abstract: Enabling robots to perform diverse tasks across varied environments is a central challenge in robot learning. While vision-language-action (VLA) models have shown promise for generalizable robot skills, realizing their full potential requires addressing limitations in action representation and efficient training. Current VLA models often focus on scaling the vision-language model (VLM) component, while the action space representation remains a critical bottleneck. This paper introduces DexVLA, a novel framework designed to enhance the efficiency and generalization capabilities of VLAs for complex, long-horizon tasks across diverse robot embodiments. DexVLA features a novel diffusion-based action expert, scaled to one billion parameters, designed for cross-embodiment learning. A novel embodiment curriculum learning strategy facilitates efficient training: (1) pre-training the diffusion expert, which is separable from the VLA, on cross-embodiment data; (2) aligning the VLA model to specific embodiments; and (3) post-training for rapid adaptation to new tasks. We conduct comprehensive experiments across multiple embodiments, including single-arm, bimanual, and dexterous-hand robots, demonstrating DexVLA's adaptability to challenging tasks without task-specific adaptation, its ability to learn dexterous skills on novel embodiments with limited data, and its capacity to complete complex, long-horizon tasks using only direct language prompting, such as laundry folding. In all settings, our method demonstrates superior performance compared to state-of-the-art models like Octo, OpenVLA, and Diffusion Policy.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: The webpage is at https://dex-vla.github.io/
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The webpage is at https://dex-vla.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05503">arXiv:2502.05503</a> <span> [<a href="https://arxiv.org/pdf/2502.05503">pdf</a>, <a href="https://arxiv.org/format/2502.05503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Physical Coherence Benchmark for Evaluating Video Generation Models via Optical Flow-guided Frame Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongfan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiuwen Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianyu Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05503v1-abstract-short" style="display: inline;"> Recent advances in video generation models demonstrate their potential as world simulators, but they often struggle with videos deviating from physical laws, a key concern overlooked by most text-to-video benchmarks. We introduce a benchmark designed specifically to assess the Physical Coherence of generated videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of physical p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05503v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05503v1-abstract-full" style="display: none;"> Recent advances in video generation models demonstrate their potential as world simulators, but they often struggle with videos deviating from physical laws, a key concern overlooked by most text-to-video benchmarks. We introduce a benchmark designed specifically to assess the Physical Coherence of generated videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of physical principles, capturing key physical laws observable in video content. We evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and conducted manual assessments. Additionally, we propose an automated evaluation model: PhyCoPredictor, a diffusion model that generates optical flow and video frames in a cascade manner. Through a consistency evaluation comparing automated and manual sorting, the experimental results show that PhyCoPredictor currently aligns most closely with human evaluation. Therefore, it can effectively evaluate the physical coherence of videos, providing insights for future model optimization. Our benchmark, which includes physical coherence prompts, automatic evaluation tool PhyCoPredictor, and generated video dataset, will all be released on GitHub shortly. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05503v1-abstract-full').style.display = 'none'; document.getElementById('2502.05503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17983">arXiv:2501.17983</a> <span> [<a href="https://arxiv.org/pdf/2501.17983">pdf</a>, <a href="https://arxiv.org/format/2501.17983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Feature Fusion for UAV Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xudong Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yaxin Peng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chaomin Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17983v2-abstract-short" style="display: inline;"> Object detection in unmanned aerial vehicle (UAV) remote sensing images poses significant challenges due to unstable image quality, small object sizes, complex backgrounds, and environmental occlusions. Small objects, in particular, occupy small portions of images, making their accurate detection highly difficult. Existing multi-scale feature fusion methods address these challenges to some extent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17983v2-abstract-full').style.display = 'inline'; document.getElementById('2501.17983v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17983v2-abstract-full" style="display: none;"> Object detection in unmanned aerial vehicle (UAV) remote sensing images poses significant challenges due to unstable image quality, small object sizes, complex backgrounds, and environmental occlusions. Small objects, in particular, occupy small portions of images, making their accurate detection highly difficult. Existing multi-scale feature fusion methods address these challenges to some extent by aggregating features across different resolutions. However, they often fail to effectively balance the classification and localization performance for small objects, primarily due to insufficient feature representation and imbalanced network information flow. In this paper, we propose a novel feature fusion framework specifically designed for UAV object detection tasks to enhance both localization accuracy and classification performance. The proposed framework integrates hybrid upsampling and downsampling modules, enabling feature maps from different network depths to be flexibly adjusted to arbitrary resolutions. This design facilitates cross-layer connections and multi-scale feature fusion, ensuring improved representation of small objects. 
   Our approach leverages hybrid downsampling to enhance fine-grained feature representation, improving spatial localization of small targets even under complex conditions. Simultaneously, the upsampling module aggregates global contextual information, optimizing feature consistency across scales and enhancing classification robustness in cluttered scenes. Experimental results on two public UAV datasets demonstrate the effectiveness of the proposed framework: integrated into the YOLO-v10 model, our method achieves a 2% improvement in average precision (AP) over the baseline YOLO-v10 while maintaining the same number of parameters. These results highlight the potential of our framework for accurate and efficient UAV object detection.
   Submitted 3 February, 2025; v1 submitted 29 January, 2025; originally announced January 2025.
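
   The core resampling idea, adjusting feature maps from different depths to a common resolution before fusing them, can be sketched as follows. The function, interpolation mode, and channel sizes are illustrative, not the paper's hybrid up/downsampling design.

      import torch
      import torch.nn.functional as F

      def fuse_features(feats, target_hw=(40, 40)):
          """feats: list of (B, C_i, H_i, W_i) maps from different depths."""
          resized = [
              F.interpolate(f, size=target_hw, mode="bilinear",
                            align_corners=False)
              for f in feats
          ]
          return torch.cat(resized, dim=1)  # (B, sum(C_i), 40, 40)

      p3 = torch.randn(1, 64, 80, 80)   # shallow, high-resolution features
      p4 = torch.randn(1, 128, 40, 40)  # mid-level features
      p5 = torch.randn(1, 256, 20, 20)  # deep, low-resolution features
      print(fuse_features([p3, p4, p5]).shape)  # torch.Size([1, 448, 40, 40])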

6. arXiv:2501.14939 [pdf, other] — cs.SI, stat.ML
   Principal Graph Encoder Embedding and Principal Community Detection
   Authors: Cencheng Shen, Yuexiao Dong, Carey E. Priebe, Jonathan Larson, Ha Trinh, Youngser Park
   Abstract: In this paper, we introduce the concept of principal communities and propose a principal graph encoder embedding method that concurrently detects these communities and achieves vertex embedding. Given a graph adjacency matrix with vertex labels, the method computes a sample community score for each community, ranking them to measure community importance and to estimate a set of principal communities. The method then produces a vertex embedding by retaining only the dimensions corresponding to these principal communities. Theoretically, we define the population version of the encoder embedding and the community score based on a random Bernoulli graph distribution. We prove that the population principal graph encoder embedding preserves the conditional density of the vertex labels and that the population community score successfully distinguishes the principal communities. We conduct a variety of simulations to demonstrate the finite-sample accuracy in detecting ground-truth principal communities, as well as the advantages in embedding visualization and subsequent vertex classification. The method is further applied to a set of real-world graphs, showcasing its numerical advantages, including robustness to label noise and computational scalability.
   Submitted 24 January, 2025; originally announced January 2025.
   Comments: 11-page main text + 5-page appendix.
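
   A rough sketch of the embedding step, assuming the label-normalized one-hot projection used in the authors' earlier graph encoder embedding work; the community score below (per-dimension variance) is a stand-in for illustration, not the paper's definition.

      import numpy as np

      def graph_encoder_embedding(A, y, K):
          n = len(y)
          W = np.zeros((n, K))
          for k in range(K):
              idx = (y == k)
              W[idx, k] = 1.0 / idx.sum()  # column-normalized one-hot labels
          return A @ W                      # (n, K): one dim per community

      rng = np.random.default_rng(0)
      y = rng.integers(0, 3, size=100)
      A = (rng.random((100, 100)) < 0.1).astype(float)
      Z = graph_encoder_embedding(A, y, K=3)
      score = Z.var(axis=0)                 # stand-in community score
      principal = np.argsort(score)[::-1][:2]
      print(Z[:, principal].shape)          # keep principal dims: (100, 2)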
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 page main + 5 page appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14932">arXiv:2501.14932</a> <span> [<a href="https://arxiv.org/pdf/2501.14932">pdf</a>, <a href="https://arxiv.org/format/2501.14932">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Explaining Categorical Feature Interactions Using Graph Covariance and LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cencheng Shen</a>, <a href="/search/cs?searchtype=author&query=Edge%2C+D">Darren Edge</a>, <a href="/search/cs?searchtype=author&query=Larson%2C+J">Jonathan Larson</a>, <a href="/search/cs?searchtype=author&query=Priebe%2C+C+E">Carey E. Priebe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14932v1-abstract-short" style="display: inline;"> Modern datasets often consist of numerous samples with abundant features and associated timestamps. Analyzing such datasets to uncover underlying events typically requires complex statistical methods and substantial domain expertise. A notable example, and the primary data focus of this paper, is the global synthetic dataset from the Counter Trafficking Data Collaborative (CTDC) -- a global hub of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14932v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14932v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14932v1-abstract-full" style="display: none;"> Modern datasets often consist of numerous samples with abundant features and associated timestamps. Analyzing such datasets to uncover underlying events typically requires complex statistical methods and substantial domain expertise. A notable example, and the primary data focus of this paper, is the global synthetic dataset from the Counter Trafficking Data Collaborative (CTDC) -- a global hub of human trafficking data containing over 200,000 anonymized records spanning from 2002 to 2022, with numerous categorical features for each record. In this paper, we propose a fast and scalable method for analyzing and extracting significant categorical feature interactions, and querying large language models (LLMs) to generate data-driven insights that explain these interactions. Our approach begins with a binarization step for categorical features using one-hot encoding, followed by the computation of graph covariance at each time. This graph covariance quantifies temporal changes in dependence structures within categorical data and is established as a consistent dependence measure under the Bernoulli distribution. 
   We use this measure to identify significant feature pairs, such as those with the most frequent trends over time or those exhibiting sudden spikes in dependence at specific moments. These extracted feature pairs, along with their timestamps, are then passed to an LLM tasked with generating potential explanations of the underlying events driving the dependence changes. The effectiveness of our method is demonstrated through extensive simulations, and its application to the CTDC dataset reveals meaningful feature pairs and potential data stories underlying the observed feature interactions.
   Submitted 24 January, 2025; originally announced January 2025.
   Comments: 18-page main text + 6-page appendix.
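
   A rough sketch of the pipeline on a toy table, with plain per-period covariance of one-hot columns standing in for the paper's graph covariance (whose exact definition is not given in the abstract); the data are made up.

      import numpy as np
      import pandas as pd

      df = pd.DataFrame({
          "year":   [2002, 2002, 2003, 2003, 2004, 2004],
          "route":  ["air", "land", "land", "land", "air", "land"],
          "gender": ["f", "m", "m", "f", "f", "m"],
      })
      X = pd.get_dummies(df[["route", "gender"]])  # binarization step

      # Track a dependence score for one feature pair in each time slice;
      # pairs with sudden spikes would be handed to the LLM for explanation.
      for year, grp in X.groupby(df["year"]):
          c = np.cov(grp["route_land"].astype(float),
                     grp["gender_m"].astype(float))[0, 1]
          print(year, round(c, 3))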

8. arXiv:2501.07849 [pdf, other] — cs.SE, cs.AI, cs.CR
   Unveiling Provider Bias in Large Language Models for Code Generation
   Authors: Xiaoyu Zhang, Juan Zhai, Shiqing Ma, Qingshuang Bao, Weipeng Jiang, Chao Shen, Yang Liu
   Abstract: Large Language Models (LLMs) have emerged as the new recommendation engines, outperforming traditional methods in both capability and scope, particularly in code generation applications. Our research reveals a novel provider bias in LLMs: without explicit input prompts, these models show systematic preferences for services from specific providers in their recommendations (e.g., favoring Google Cloud over Microsoft Azure). This bias holds significant implications for market dynamics and societal equilibrium, potentially promoting digital monopolies. It may also deceive users and violate their expectations, leading to various consequences. This paper presents the first comprehensive empirical study of provider bias in LLM code generation. We develop a systematic methodology encompassing an automated pipeline for dataset generation, incorporating 6 distinct coding task categories and 30 real-world application scenarios. Our analysis covers over 600,000 LLM-generated responses across seven state-of-the-art models, utilizing approximately 500 million tokens (equivalent to $5,000+ in computational costs). The study evaluates both the generated code snippets and their embedded service provider selections to quantify provider bias. Additionally, we conduct a comparative analysis of seven debiasing prompting techniques to assess their efficacy in mitigating these biases. Our findings demonstrate that LLMs exhibit significant provider preferences, predominantly favoring services from Google and Amazon, and can autonomously modify input code to incorporate their preferred providers without users' requests. Notably, we observe discrepancies between providers recommended in conversational contexts and those implemented in generated code. The complete dataset and analysis results are available in our repository.
   Submitted 14 January, 2025; originally announced January 2025.
   Comments: 21 pages, 15 figures.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06700">arXiv:2501.06700</a> <span> [<a href="https://arxiv.org/pdf/2501.06700">pdf</a>, <a href="https://arxiv.org/format/2501.06700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Average Reward Reinforcement Learning for Wireless Radio Resource Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kun Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06700v1-abstract-short" style="display: inline;"> In this paper, we address a crucial but often overlooked issue in applying reinforcement learning (RL) to radio resource management (RRM) in wireless communications: the mismatch between the discounted reward RL formulation and the undiscounted goal of wireless network optimization. To the best of our knowledge, we are the first to systematically investigate this discrepancy, starting with a discu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06700v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06700v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06700v1-abstract-full" style="display: none;"> In this paper, we address a crucial but often overlooked issue in applying reinforcement learning (RL) to radio resource management (RRM) in wireless communications: the mismatch between the discounted reward RL formulation and the undiscounted goal of wireless network optimization. To the best of our knowledge, we are the first to systematically investigate this discrepancy, starting with a discussion of the problem formulation followed by simulations that quantify the extent of the gap. To bridge this gap, we introduce the use of average reward RL, a method that aligns more closely with the long-term objectives of RRM. We propose a new method called the Average Reward Off policy Soft Actor Critic (ARO SAC) is an adaptation of the well known Soft Actor Critic algorithm in the average reward framework. This new method achieves significant performance improvement our simulation results demonstrate a 15% gain in the system performance over the traditional discounted reward RL approach, underscoring the potential of average reward RL in enhancing the efficiency and effectiveness of wireless network optimization. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06700v1-abstract-full').style.display = 'none'; document.getElementById('2501.06700v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Asilomar 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02997">arXiv:2501.02997</a> <span> [<a href="https://arxiv.org/pdf/2501.02997">pdf</a>, <a href="https://arxiv.org/format/2501.02997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CALM: Curiosity-Driven Auditing for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02997v1-abstract-short" style="display: inline;"> Auditing Large Language Models (LLMs) is a crucial and challenging task. In this study, we focus on auditing black-box LLMs without access to their parameters, only to the provided service. We treat this type of auditing as a black-box optimization problem where the goal is to automatically uncover input-output pairs of the target LLMs that exhibit illegal, immoral, or unsafe behaviors. For instan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02997v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02997v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02997v1-abstract-full" style="display: none;"> Auditing Large Language Models (LLMs) is a crucial and challenging task. In this study, we focus on auditing black-box LLMs without access to their parameters, only to the provided service. We treat this type of auditing as a black-box optimization problem where the goal is to automatically uncover input-output pairs of the target LLMs that exhibit illegal, immoral, or unsafe behaviors. For instance, we may seek a non-toxic input that the target LLM responds to with a toxic output or an input that induces the hallucinative response from the target LLM containing politically sensitive individuals. This black-box optimization is challenging due to the scarcity of feasible points, the discrete nature of the prompt space, and the large search space. 
    To address these challenges, we propose Curiosity-Driven Auditing for Large Language Models (CALM), which uses intrinsically motivated reinforcement learning to finetune an LLM as the auditor agent, uncovering potentially harmful and biased input-output pairs of the target LLM. CALM successfully identifies derogatory completions involving celebrities and uncovers inputs that elicit specific names under the black-box setting. This work offers a promising direction for auditing black-box LLMs. Our code is available at https://github.com/x-zheng16/CALM.git.
    Submitted 6 January, 2025; originally announced January 2025.
    Comments: Accepted by AAAI 2025 AI Alignment Track.
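
    A toy illustration of the curiosity ingredient: shaping the auditor's reward with a count-based novelty bonus so the agent keeps exploring new prompts rather than exploiting one success. CALM's actual intrinsic reward and RL fine-tuning loop are more involved; the functions and constants below are hypothetical stand-ins.

       from collections import Counter

       visits = Counter()

       def audit_reward(prompt: str, toxicity: float, beta=0.5) -> float:
           visits[prompt] += 1
           novelty = 1.0 / visits[prompt] ** 0.5  # decays on revisits
           return toxicity + beta * novelty       # extrinsic + intrinsic

       print(audit_reward("tell me about X", 0.1))  # first visit: big bonus
       print(audit_reward("tell me about X", 0.1))  # repeat: smaller bonus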

11. arXiv:2501.02198 [pdf, other] — cs.LG, cs.CV
    Fresh-CL: Feature Realignment through Experts on Hypersphere in Continual Learning
    Authors: Zhongyi Zhou, Yaxin Peng, Pin Yi, Minjie Zhu, Chaomin Shen
    Abstract: Continual learning enables models to learn and adapt to new tasks while retaining prior knowledge. Introducing new tasks, however, can naturally lead to feature entanglement across tasks, limiting the model's capability to distinguish between new domain data. In this work, we propose a method called Feature Realignment through Experts on hyperSpHere in Continual Learning (Fresh-CL). By leveraging predefined and fixed simplex equiangular tight frame (ETF) classifiers on a hypersphere, our model improves feature separation both within and across tasks. However, the projection to a simplex ETF shifts with new tasks, disrupting the structured feature representation of previous tasks and degrading performance. Therefore, we propose a dynamic extension of the ETF through a mixture of experts (MoE), enabling adaptive projections onto diverse subspaces to enhance feature representation. Experiments on 11 datasets demonstrate a 2% improvement in accuracy compared to the strongest baseline, particularly on fine-grained datasets, confirming the efficacy of combining ETF and MoE to improve feature distinction in continual learning scenarios.
    Submitted 12 January, 2025; v1 submitted 4 January, 2025; originally announced January 2025.
    Comments: Accepted by ICASSP 2025.
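
    The fixed classifier Fresh-CL builds on is simple to construct: a simplex ETF of K unit vectors whose pairwise cosine is exactly −1/(K−1), the maximally separated configuration. A minimal sketch of the standard construction (in practice the frame is rotated into the feature dimension, which is omitted here):

       import numpy as np

       def simplex_etf(K: int) -> np.ndarray:
           # Rows are K equiangular unit vectors (here in K-dim space).
           return np.sqrt(K / (K - 1)) * (np.eye(K) - np.ones((K, K)) / K)

       E = simplex_etf(4)
       G = E @ E.T                      # Gram matrix of the prototypes
       print(np.round(np.diag(G), 3))   # all 1.0: unit-norm prototypes
       print(round(G[0, 1], 3))         # -0.333 = -1/(K-1) for K = 4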
To facilitate the development and accessibility of artificial visual intelligence, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20404v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20404v1-abstract-full" style="display: none;"> Vision and language are the two foundational senses for humans, and they build up our cognitive ability and intelligence. While significant breakthroughs have been made in AI language ability, artificial visual intelligence, especially the ability to generate and simulate the world we see, is far lagging behind. To facilitate the development and accessibility of artificial visual intelligence, we created Open-Sora, an open-source video generation model designed to produce high-fidelity video content. Open-Sora supports a wide spectrum of visual generation tasks, including text-to-image generation, text-to-video generation, and image-to-video generation. The model leverages advanced deep learning architectures and training/inference techniques to enable flexible video synthesis, which could generate video content of up to 15 seconds, up to 720p resolution, and arbitrary aspect ratios. Specifically, we introduce Spatial-Temporal Diffusion Transformer (STDiT), an efficient diffusion framework for videos that decouples spatial and temporal attention. We also introduce a highly compressive 3D autoencoder to make representations compact and further accelerate training with an ad hoc training strategy. Through this initiative, we aim to foster innovation, creativity, and inclusivity within the community of AI content creation. By embracing the open-source principle, Open-Sora democratizes full access to all the training/inference/data preparation codes as well as model weights. All resources are publicly available at: https://github.com/hpcaitech/Open-Sora. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20404v1-abstract-full').style.display = 'none'; document.getElementById('2412.20404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
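The decoupled spatial/temporal attention that the STDiT description points to is commonly implemented by alternating attention over the spatial axis and the temporal axis of the token grid; a minimal PyTorch-style sketch under that assumption (shapes and names are ours, not Open-Sora's):

```python
import torch
import torch.nn as nn

class FactorizedSTBlock(nn.Module):
    """Attend over space and time separately instead of over all T*S tokens
    jointly, cutting attention cost from O((T*S)^2) to O(T*S*(T+S))."""
    def __init__(self, dim: int, heads: int = 8):
        super().__init__()
        self.spatial = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.temporal = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, T frames, S spatial tokens, dim)
        b, t, s, d = x.shape
        xs = self.norm1(x).reshape(b * t, s, d)  # attention within each frame
        x = x + self.spatial(xs, xs, xs)[0].reshape(b, t, s, d)
        # attention across time, one spatial location at a time
        xt = self.norm2(x).permute(0, 2, 1, 3).reshape(b * s, t, d)
        out = self.temporal(xt, xt, xt)[0].reshape(b, s, t, d).permute(0, 2, 1, 3)
        return x + out

video = torch.randn(2, 16, 64, 256)          # 16 frames, 8x8 latent tokens
print(FactorizedSTBlock(256)(video).shape)   # torch.Size([2, 16, 64, 256])
```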
arXiv:2412.18844 [pdf, other] cs.CR cs.CV cs.LG
Improving Integrated Gradient-based Transferable Adversarial Examples by Refining the Integration Path
Authors: Yuchen Ren, Zhengyu Zhao, Chenhao Lin, Bo Yang, Lu Zhou, Zhe Liu, Chao Shen
Abstract: Transferable adversarial examples are known to cause threats in practical, black-box attack scenarios. A notable approach to improving transferability is to use integrated gradients (IG), originally developed for model interpretability. In this paper, we find that existing IG-based attacks have limited transferability due to their naive adoption of IG from model interpretability. To address this limitation, we focus on the IG integration path and refine it in three aspects: multiplicity, monotonicity, and diversity, supported by theoretical analyses. We propose the Multiple Monotonic Diversified Integrated Gradients (MuMoDIG) attack, which can generate highly transferable adversarial examples on different CNN and ViT models and defenses. Experiments validate that MuMoDIG outperforms the latest IG-based attack by up to 37.3% and other state-of-the-art attacks by 8.4%. In general, our study reveals that migrating established techniques to improve transferability may require non-trivial effort. Code is available at https://github.com/RYC-98/MuMoDIG.
Submitted 25 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
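For reference, plain integrated gradients (the building block MuMoDIG refines) approximate the path integral of gradients along a straight line from a baseline x' to the input x; a minimal sketch of that baseline technique (our own illustration, not MuMoDIG's refined paths):

```python
import torch

def integrated_gradients(model, x, baseline, target: int, steps: int = 32):
    """Riemann-sum approximation of IG along the line baseline -> x:
    IG(x) = (x - baseline) * mean_k grad F_target(baseline + k/m (x - baseline))."""
    alphas = torch.linspace(1.0 / steps, 1.0, steps)
    grad_sum = torch.zeros_like(x)
    for a in alphas:
        xi = (baseline + a * (x - baseline)).detach().requires_grad_(True)
        logit = model(xi.unsqueeze(0))[0, target]
        grad_sum += torch.autograd.grad(logit, xi)[0]
    return (x - baseline) * grad_sum / steps
```

IG-based attacks typically ascend this attribution signal instead of the raw gradient; MuMoDIG's contribution lies in replacing the single straight-line path with multiple, monotonic, diversified paths.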
arXiv:2412.18833 [pdf, other] cs.CV
Federated Learning with Partially Labeled Data: A Conditional Distillation Approach
Authors: Pochuan Wang, Chen Shen, Masahiro Oda, Chiou-Shann Fuh, Kensaku Mori, Weichung Wang, Holger R. Roth
Abstract: In medical imaging, developing generalized segmentation models that can handle multiple organs and lesions is crucial. However, the scarcity of fully annotated datasets and strict privacy regulations present significant barriers to data sharing. Federated Learning (FL) allows decentralized model training, but existing FL methods often struggle with partial labeling, leading to model divergence and catastrophic forgetting. We propose ConDistFL, a novel FL framework incorporating conditional distillation to address these challenges. ConDistFL enables effective learning from partially labeled datasets, significantly improving segmentation accuracy across distributed and non-uniform datasets. In addition to its superior segmentation performance, ConDistFL maintains computational and communication efficiency, ensuring its scalability for real-world applications. Furthermore, ConDistFL demonstrates remarkable generalizability, significantly outperforming existing FL methods in out-of-federation tests, even adapting to unseen contrast phases (e.g., non-contrast CT images) in our experiments. Extensive evaluations on 3D CT and 2D chest X-ray datasets show that ConDistFL is an efficient, adaptable solution for collaborative medical image segmentation in privacy-constrained settings.
Submitted 25 December, 2024; originally announced December 2024.
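The abstract does not spell out the distillation objective. One common way to condition distillation on partial labels is to distill the global model's predictions only on the classes a client has no annotations for; the sketch below is our assumption of that pattern, not the paper's exact loss:

```python
import torch
import torch.nn.functional as F

def conditional_distill_loss(student_logits, teacher_logits, labeled_mask):
    """KL(student || teacher) restricted to channels this client cannot supervise.

    student_logits, teacher_logits: (batch, classes, ...) raw outputs.
    labeled_mask: (classes,) bool tensor, True where ground truth exists locally;
    supervised loss would cover those channels, distillation covers the rest.
    """
    unlabeled = ~labeled_mask
    s = F.log_softmax(student_logits[:, unlabeled], dim=1)
    t = F.softmax(teacher_logits[:, unlabeled], dim=1)
    return F.kl_div(s, t, reduction="batchmean")
```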
arXiv:2412.16720 [pdf, other] cs.AI
OpenAI o1 System Card
Authors: OpenAI: Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, Alex Iftimie, Alex Karpenko, Alex Tachard Passos, Alexander Neitz, Alexander Prokofiev, Alexander Wei, Allison Tam, Ally Bennett, Ananya Kumar, Andre Saraiva, Andrea Vallone, Andrew Duberstein, Andrew Kondrich, et al. (238 additional authors not shown)
Abstract: The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. This leads to state-of-the-art performance on certain benchmarks for risks such as generating illicit advice, choosing stereotyped responses, and succumbing to known jailbreaks. Training models to incorporate a chain of thought before answering has the potential to unlock substantial benefits, while also increasing potential risks that stem from heightened intelligence. Our results underscore the need for building robust alignment methods, extensively stress-testing their efficacy, and maintaining meticulous risk-management protocols. This report outlines the safety work carried out for the OpenAI o1 and OpenAI o1-mini models, including safety evaluations, external red teaming, and Preparedness Framework evaluations.
Submitted 21 December, 2024; originally announced December 2024.
arXiv:2412.16583 [pdf, other] cs.CV
REO-VLM: Transforming VLM to Meet Regression Challenges in Earth Observation
Authors: Xizhe Xue, Guoting Wei, Hao Chen, Haokui Zhang, Feng Lin, Chunhua Shen, Xiao Xiang Zhu
Abstract: The rapid evolution of Vision Language Models (VLMs) has catalyzed significant advancements in artificial intelligence, expanding research across various disciplines, including Earth Observation (EO). While VLMs have enhanced image understanding and data processing within EO, their applications have predominantly focused on image content description. This limited focus overlooks their potential in geographic and scientific regression tasks, which are essential for diverse EO applications. To bridge this gap, this paper introduces a novel benchmark dataset called REO-Instruct, which unifies regression and generation tasks specifically for the EO domain. Comprising 1.6 million multimodal EO image-language pairs, this dataset is designed to support both biomass regression and image content interpretation tasks. Leveraging this dataset, we develop REO-VLM, a groundbreaking model that seamlessly integrates regression capabilities with traditional generative functions. By utilizing language-driven reasoning to incorporate scientific domain knowledge, REO-VLM goes beyond relying solely on EO imagery, enabling comprehensive interpretation of complex scientific attributes from EO data. This approach establishes new performance benchmarks and significantly enhances the capabilities of environmental monitoring and resource management.
Submitted 21 December, 2024; originally announced December 2024.
arXiv:2412.13817 [pdf, other] cs.CV
Nullu: Mitigating Object Hallucinations in Large Vision-Language Models via HalluSpace Projection
Authors: Le Yang, Ziwei Zheng, Boxu Chen, Zhengyu Zhao, Chenhao Lin, Chao Shen
Abstract: Recent studies have shown that large vision-language models (LVLMs) often suffer from object hallucinations (OH). To mitigate this issue, we introduce an efficient method that edits the model weights based on an unsafe subspace, which we call the HalluSpace in this paper. With truthful and hallucinated text prompts accompanying the visual content as inputs, the HalluSpace can be identified by extracting the hallucinated embedding features and removing the truthful representations in LVLMs. By orthogonalizing the model weights, input features are projected into the null space of the HalluSpace to reduce OH, which is why we name our method Nullu. We reveal that HalluSpaces generally contain statistical bias and the unimodal priors of the large language models (LLMs) used to build LVLMs, which previous studies have shown to be essential causes of OH. Null-space projection therefore suppresses the LLMs' priors to filter out the hallucinated features, resulting in contextually accurate outputs. Experiments show that our method effectively mitigates OH across different LVLM families without extra inference cost and also performs strongly on general LVLM benchmarks. Code is released at https://github.com/Ziwei-Zheng/Nullu.
Submitted 29 December, 2024; v1 submitted 18 December, 2024; originally announced December 2024.
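Mechanically, projecting weights into the null space of a low-rank "hallucination" subspace is a short linear-algebra edit; a minimal sketch under our reading of the abstract (in the paper, the basis V would be estimated from hallucinated-vs-truthful embedding differences; here it is arbitrary):

```python
import torch

def nullspace_project(W: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
    """Edit weight W (out_dim x in_dim) so that input components lying in the
    subspace spanned by the columns of V (in_dim x k) no longer affect the output."""
    Q, _ = torch.linalg.qr(V)            # orthonormal basis of the subspace
    P = torch.eye(W.shape[1]) - Q @ Q.T  # projector onto its null space
    return W @ P                         # W_edited @ x == W @ (x - proj_V(x))

# toy check: directions inside the subspace are annihilated
W = torch.randn(8, 16); V = torch.randn(16, 2)
print((nullspace_project(W, V) @ V).abs().max())  # ~0
```

Because the edit is applied to the weights once, offline, it adds no inference-time cost, which matches the abstract's claim.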
arXiv:2412.13667 [pdf, other] cs.LG cs.AI stat.ME
Exploring Multi-Modal Integration with Tool-Augmented LLM Agents for Precise Causal Discovery
Authors: ChengAo Shen, Zhengzhang Chen, Dongsheng Luo, Dongkuan Xu, Haifeng Chen, Jingchao Ni
Abstract: Causal inference is an imperative foundation for decision-making across domains such as smart health, AI for drug discovery, and AIOps. Traditional statistical causal discovery methods, while well established, predominantly rely on observational data and often overlook the semantic cues inherent in cause-and-effect relationships. The advent of Large Language Models (LLMs) has ushered in an affordable way of leveraging these semantic cues for knowledge-driven causal discovery, but the development of LLMs for causal discovery lags behind other areas, particularly in the exploration of multi-modality data. To bridge this gap, we introduce MATMCD, a multi-agent system powered by tool-augmented LLMs. MATMCD has two key agents: a Data Augmentation agent that retrieves and processes modality-augmented data, and a Causal Constraint agent that integrates multi-modal data for knowledge-driven inference. Careful design of the agents' inner workings ensures their successful cooperation. Our empirical study across seven datasets suggests the significant potential of multi-modality-enhanced causal discovery.
Submitted 18 December, 2024; originally announced December 2024.
arXiv:2412.11998 [pdf, other] cs.CV
SAMIC: Segment Anything with In-Context Spatial Prompt Engineering
Authors: Savinay Nagendra, Kashif Rashid, Chaopeng Shen, Daniel Kifer
Abstract: Few-shot segmentation is the problem of learning to identify specific types of objects (e.g., airplanes) in images from a small set of labeled reference images. The current state of the art is driven by resource-intensive construction of models for every new domain-specific application. Such models must be trained on enormous labeled datasets of unrelated objects (e.g., cars, trains, animals) so that their "knowledge" can be transferred to new types of objects. In this paper, we show how to leverage existing vision foundation models (VFMs) to reduce the incremental cost of creating few-shot segmentation models for new domains. Specifically, we introduce SAMIC, a small network that learns how to prompt VFMs in order to segment new types of objects in domain-specific applications. SAMIC enables any task to be approached as a few-shot learning problem. At 2.6 million parameters, it is 94% smaller than the leading models (e.g., those with a ResNet-101 backbone and 45+ million parameters). Even using 1/5th of the training data provided by one-shot benchmarks, SAMIC is competitive with, or sets, the state of the art on a variety of few-shot and semantic segmentation datasets including COCO-$20^i$, Pascal-$5^i$, PerSeg, FSS-1000, and NWPU VHR-10.
Submitted 16 December, 2024; originally announced December 2024.
arXiv:2412.11159 [pdf, other] cs.CE
A Report on Financial Regulations Challenge at COLING 2025
Authors: Keyi Wang, Jaisal Patel, Charlie Shen, Daniel Kim, Andy Zhu, Alex Lin, Luca Borella, Cailean Osborne, Matt White, Steve Yang, Kairong Xiao, Xiao-Yang Liu Yanglet
Abstract: Financial large language models (FinLLMs) have been applied to various tasks in business, finance, accounting, and auditing. Complex financial regulations and standards are critical to financial services, and LLMs must comply with them. However, FinLLMs' performance in understanding and interpreting financial regulations has rarely been studied. We therefore organize the Regulations Challenge, a shared task at COLING 2025, to encourage the academic community to explore the strengths and limitations of popular LLMs. We create 9 novel tasks and corresponding question sets. In this paper, we provide an overview of these tasks and summarize participants' approaches and results. We aim to raise awareness of FinLLMs' professional capability in financial regulations.
Submitted 12 January, 2025; v1 submitted 15 December, 2024; originally announced December 2024.
Comments: 8 pages, 4 tables
arXiv:2412.09032 [pdf, other] cs.SD cs.AI eess.AS
doi: 10.24963/ijcai.2024/46
Speech-Forensics: Towards Comprehensive Synthetic Speech Dataset Establishment and Analysis
Authors: Zhoulin Ji, Chenhao Lin, Hang Wang, Chao Shen
Abstract: Distinguishing synthetic from real speech is increasingly crucial due to the risks of misinformation and identity impersonation. While various datasets for synthetic speech analysis have been developed, they often focus on specific areas, limiting their utility for comprehensive research. To fill this gap, we propose the Speech-Forensics dataset, which extensively covers authentic, synthetic, and partially forged speech samples that include multiple segments synthesized by different high-quality algorithms. Moreover, we propose a TEmporal Speech LocalizaTion network, called TEST, that simultaneously performs authenticity detection, localization of multiple fake segments, and recognition of synthesis algorithms, without any complex post-processing. TEST effectively integrates an LSTM and a Transformer to extract more powerful temporal speech representations and uses dense prediction on multi-scale pyramid features to estimate the synthetic spans. Our model achieves an average mAP of 83.55% and an EER of 5.25% at the utterance level. At the segment level, it attains an EER of 1.07% and a 92.19% F1 score. These results highlight the model's robust capability for comprehensive analysis of synthetic speech, offering a promising avenue for future research and practical applications in this field.
Submitted 16 December, 2024; v1 submitted 12 December, 2024; originally announced December 2024.
Journal ref: Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI-24, 2024, pp. 413-421
arXiv:2412.06308 [pdf, other] cs.IR cs.AI
PRECISE: Pre-training Sequential Recommenders with Collaborative and Semantic Information
Authors: Chonggang Song, Chunxu Shen, Hao Gu, Yaoming Wu, Lingling Yi, Jie Wen, Chuan Chen
Abstract: Real-world recommendation systems commonly offer diverse content scenarios for users to interact with. Considering the enormous number of users on industrial platforms, it is infeasible to use a single unified recommendation model to meet the requirements of all scenarios; usually, separate recommendation pipelines are established for each distinct scenario. This practice leads to challenges in comprehensively grasping users' interests. Recent research has sought to tackle this problem by pre-training models that encapsulate users' overall interests. Traditional pre-trained recommendation models mainly capture user interests by leveraging collaborative signals. Nevertheless, a prevalent drawback of these systems is their inability to handle long-tail items and cold-start scenarios. With the recent advent of large language models, research efforts have increasingly focused on exploiting LLMs to extract semantic information for users and items. However, text-based recommendations rely heavily on elaborate feature engineering and frequently fail to capture collaborative similarities. To overcome these limitations, we propose PRECISE, a novel pre-training framework for sequential recommendation that combines collaborative signals with semantic information. Moreover, PRECISE employs a learning framework that first models users' comprehensive interests across all recommendation scenarios and then concentrates on the specific interests of target-scene behaviors. We demonstrate that PRECISE precisely captures the entire range of user interests and effectively transfers them to the target interests. Empirical findings reveal that PRECISE attains outstanding performance on both public and industrial datasets.
Submitted 9 December, 2024; originally announced December 2024.
arXiv:2412.03908 [pdf, other] cs.CV cs.CR cs.LG
Can Targeted Clean-Label Poisoning Attacks Generalize?
Authors: Zhizhen Chen, Subrat Kishore Dutta, Zhengyu Zhao, Chenhao Lin, Chao Shen, Xiao Zhang
Abstract: Targeted poisoning attacks aim to compromise a model's prediction on specific target samples. In a common clean-label setting, they are achieved by slightly perturbing a subset of training samples, given access to those specific targets. Despite continuous efforts, it remains unexplored whether such attacks can generalize to unknown variations of those targets. In this paper, we take the first step toward systematically studying this generalization problem. Observing that the widely adopted cosine-similarity-based attack exhibits limited generalizability, we propose a well-generalizable attack that leverages both the direction and magnitude of model gradients. In particular, we explore diverse target variations, such as an object with varied viewpoints and an animal species with distinct appearances. Extensive experiments across various generalization scenarios demonstrate that our method consistently achieves the best attack effectiveness. For example, our method outperforms the cosine-similarity-based attack by 20.95% in attack success rate, with similar overall accuracy, averaged over four models on two image benchmark datasets. The code is available at https://github.com/jiaangk/generalizable_tcpa
Submitted 5 December, 2024; originally announced December 2024.
Comments: 12 pages, 5 figures, 5 tables
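For context, clean-label gradient-matching attacks craft poison perturbations so that the training gradient on the poisons aligns with the adversarial gradient on the targets. The cosine-similarity objective that the abstract contrasts against looks roughly like the sketch below (our illustration of the baseline, not the paper's improved loss, which also matches gradient magnitude):

```python
import torch

def cosine_matching_loss(model, loss_fn, poisons, poison_labels, target_grad):
    """1 - cos(grad on poisons, adversarial grad on targets).

    Minimizing this w.r.t. the poison perturbations aligns gradient *directions*
    only; target_grad is a precomputed list of per-parameter gradient tensors.
    """
    loss = loss_fn(model(poisons), poison_labels)
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params, create_graph=True)
    g = torch.cat([x.flatten() for x in grads])
    t = torch.cat([x.flatten() for x in target_grad])
    return 1 - torch.nn.functional.cosine_similarity(g, t, dim=0)
```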
arXiv:2412.03293 [pdf, other] cs.RO cs.CV
Diffusion-VLA: Scaling Robot Foundation Models via Unified Diffusion and Autoregression
Authors: Junjie Wen, Minjie Zhu, Yichen Zhu, Zhibin Tang, Jinming Li, Zhongyi Zhou, Chengmeng Li, Xiaoyu Liu, Yaxin Peng, Chaomin Shen, Feifei Feng
Abstract: In this paper, we present DiffusionVLA, a novel framework that seamlessly combines an autoregressive model with a diffusion model for learning visuomotor policies. Central to our approach is a next-token prediction objective, enabling the model to reason effectively over the user's query in the context of current observations. Subsequently, a diffusion model is attached to generate robust action outputs. To enhance policy learning through self-reasoning, we introduce a novel reasoning-injection module that integrates reasoning phrases directly into the policy learning process. The whole framework is simple and flexible, making it easy to deploy and upgrade. We conduct extensive experiments using multiple real robots to validate the effectiveness of DiffusionVLA. Our tests include a challenging factory sorting task, where DiffusionVLA successfully categorizes objects, including those not seen during training. We observe that the reasoning module makes the model interpretable: it allows observers to understand the model's thought process and identify potential causes of policy failures. Additionally, we test DiffusionVLA on a zero-shot bin-picking task, achieving 63.7% accuracy on 102 previously unseen objects. Our method demonstrates robustness to visual changes, such as distractors and new backgrounds, and easily adapts to new embodiments. Furthermore, DiffusionVLA is data-efficient and fast at inference: our smallest DiffusionVLA-2B runs at 82 Hz on a single A6000 GPU and can be trained from scratch on fewer than 50 demonstrations for a complex task. Finally, we scale the model from 2B to 72B parameters, showcasing improved generalization capabilities with increased model size.
Submitted 4 December, 2024; originally announced December 2024.
Comments: The project page is available at: http://diffusion-vla.github.io
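Schematically, the unified design the abstract describes pairs an autoregressive backbone (producing a reasoning/conditioning embedding) with a small diffusion head that denoises action vectors conditioned on that embedding. A toy sketch of the diffusion-head side, with all shapes and names our own assumptions rather than DiffusionVLA's actual architecture:

```python
import torch
import torch.nn as nn

class DiffusionActionHead(nn.Module):
    """Toy denoiser: predicts the noise added to an action vector, conditioned
    on a context embedding that would come from the autoregressive backbone."""
    def __init__(self, action_dim=7, ctx_dim=256, hidden=512, steps=100):
        super().__init__()
        self.steps = steps
        self.net = nn.Sequential(
            nn.Linear(action_dim + ctx_dim + 1, hidden), nn.SiLU(),
            nn.Linear(hidden, action_dim))

    def forward(self, noisy_action, t, ctx):
        t_feat = t.float().unsqueeze(-1) / self.steps  # crude timestep encoding
        return self.net(torch.cat([noisy_action, ctx, t_feat], dim=-1))

head = DiffusionActionHead()
out = head(torch.randn(4, 7), torch.randint(0, 100, (4,)), torch.randn(4, 256))
print(out.shape)  # torch.Size([4, 7])
```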
arXiv:2411.15262 [pdf, other] cs.CV
MovieBench: A Hierarchical Movie Level Dataset for Long Video Generation
Authors: Weijia Wu, Mingyu Liu, Zeyu Zhu, Xi Xia, Haoen Feng, Wen Wang, Kevin Qinghong Lin, Chunhua Shen, Mike Zheng Shou
Abstract: Recent advancements in video generation models, like Stable Video Diffusion, show promising results, but primarily focus on short, single-scene videos. These models struggle with generating long videos that involve multiple scenes, coherent narratives, and consistent characters. Furthermore, there is no publicly available dataset tailored for the analysis, evaluation, and training of long video generation models. In this paper, we present MovieBench: A Hierarchical Movie-Level Dataset for Long Video Generation, which addresses these challenges through its unique contributions: (1) movie-length videos featuring rich, coherent storylines and multi-scene narratives; (2) consistency of character appearance and audio across scenes; and (3) a hierarchical data structure containing high-level movie information and detailed shot-level descriptions. Experiments demonstrate that MovieBench brings some new insights and challenges, such as maintaining character ID consistency across multiple scenes for various characters. The dataset will be public and continuously maintained, aiming to advance the field of long video generation. Data can be found at: https://weijiawu.github.io/MovieBench/.
Submitted 22 November, 2024; originally announced November 2024.
Comments: The project website is at: https://weijiawu.github.io/MovieBench/. Code: https://github.com/showlab/MovieBecnh
arXiv:2411.15056 [pdf, other] cs.CY cs.AI
Financial Risk Assessment via Long-term Payment Behavior Sequence Folding
Authors: Yiran Qiao, Yateng Tang, Xiang Ao, Qi Yuan, Ziming Liu, Chen Shen, Xuehao Zheng
Abstract: Online inclusive financial services encounter significant financial risks due to their expansive user base and low default costs. Through real-world practice, we find that utilizing longer-term user payment behaviors can enhance models' ability to forecast financial risks. However, learning long behavior sequences is non-trivial for deep sequential models. Additionally, the diverse fields of payment behaviors carry rich information that requires thorough exploitation. These factors collectively complicate the task of long-term user behavior modeling. To tackle these challenges, we propose a Long-term payment Behavior Sequence Folding method, referred to as LBSF. In LBSF, payment behavior sequences are folded based on merchants, using the merchant field as an intrinsic grouping criterion, which enables informative parallelism without reliance on external knowledge. Meanwhile, we maximize the utility of payment details through a multi-field behavior encoding mechanism. Subsequently, behavior aggregation at the merchant level, followed by relational learning across merchants, facilitates comprehensive user financial representation. We evaluate LBSF on the financial risk assessment task using a large-scale real-world dataset. The results demonstrate that folding long behavior sequences based on internal behavioral cues effectively models long-term patterns and changes, thereby generating more accurate user financial profiles for practical applications.
Submitted 22 November, 2024; originally announced November 2024.
Comments: ICDM 2024 long paper
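The folding step itself is simple to picture: one long event sequence becomes a set of shorter per-merchant subsequences that can be encoded in parallel and then aggregated. A schematic sketch of that regrouping, with field names that are our own illustration rather than LBSF's schema:

```python
from collections import defaultdict

def fold_by_merchant(events):
    """Fold one user's payment sequence into per-merchant subsequences.

    events: list of dicts like {"merchant": str, "ts": int, "amount": float},
    assumed sorted by timestamp. Returns {merchant: [events...]}; each short
    subsequence would then be encoded independently, followed by merchant-level
    aggregation and cross-merchant relational learning.
    """
    folded = defaultdict(list)
    for e in events:
        folded[e["merchant"]].append(e)
    return dict(folded)

seq = [{"merchant": "grocer", "ts": 1, "amount": 12.0},
       {"merchant": "cafe", "ts": 2, "amount": 3.5},
       {"merchant": "grocer", "ts": 3, "amount": 8.2}]
print({m: len(v) for m, v in fold_by_merchant(seq).items()})  # {'grocer': 2, 'cafe': 1}
```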
Additionally, the diverse fields of payment behaviors carry rich information, requiring thorough exploitation. These factors collectively complicate the task of long-term user behavior modeling. To tackle these challenges, we propose a Long-term Payment Behavior Sequence Folding method, referred to as LBSF. In LBSF, payment behavior sequences are folded based on merchants, using the merchant field as an intrinsic grouping criterion, which enables informative parallelism without reliance on external knowledge. Meanwhile, we maximize the utility of payment details through a multi-field behavior encoding mechanism. Subsequently, behavior aggregation at the merchant level followed by relational learning across merchants facilitates comprehensive user financial representation. We evaluate LBSF on the financial risk assessment task using a large-scale real-world dataset. The results demonstrate that folding long behavior sequences based on internal behavioral cues effectively models long-term patterns and changes, thereby generating more accurate user financial profiles for practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15056v1-abstract-full').style.display = 'none'; document.getElementById('2411.15056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICDM2024 long paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11205">arXiv:2411.11205</a> <span> [<a href="https://arxiv.org/pdf/2411.11205">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Examining Platformization in Cultural Production: A Comparative Computational Analysis of Hit Songs on TikTok and Spotify </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ta%2C+N">Na Ta</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+F">Fang Jiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Cong Lin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cuihua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11205v1-abstract-short" style="display: inline;"> The (re)creation and distribution of cultural products such as music are increasingly shaped by digital platforms. This study explores how TikTok and Spotify, situated in different governance and user contexts, could influence digital music production and reception within each platform and between each other. 
Focusing on daily hit song charts as the embodiment of platformization, we collected and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11205v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11205v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11205v1-abstract-full" style="display: none;"> The (re)creation and distribution of cultural products such as music are increasingly shaped by digital platforms. This study explores how TikTok and Spotify, situated in different governance and user contexts, could influence digital music production and reception within each platform and between each other. Focusing on daily hit song charts as the embodiment of platformization, we collected and analyzed a two-year longitudinal dataset on TikTok and Spotify. We tested the relationships between elements of platformization and hit song popularity within each platform, and examined cross-platform influence flow. Results reveal significant differences in major label, genre, and content features among hit songs on TikTok and Spotify, which can be explained by their distinct platformization practices. We also found some evidence that hit song popularity on Spotify might precede that on TikTok. This study illustrates both the platform-specific mechanisms of TikTok and Spotify and their interconnectedness in the cultural production ecosystem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11205v1-abstract-full').style.display = 'none'; document.getElementById('2411.11205v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09968">arXiv:2411.09968</a> <span> [<a href="https://arxiv.org/pdf/2411.09968">pdf</a>, <a href="https://arxiv.org/format/2411.09968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate Hallucination in LVLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+Y">Yihao Quan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chaochen Gu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiaosong Yuan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shaotian Yan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kaijie Wu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09968v1-abstract-short" style="display: inline;"> The hallucination problem in multimodal large language models (MLLMs) remains a common issue. Although image tokens occupy a majority of the input sequence of MLLMs, there is limited research to explore the relationship between image tokens and hallucinations. In this paper, we analyze the distribution of attention scores for image tokens across each layer and head of the model, revealing an intri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09968v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09968v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09968v1-abstract-full" style="display: none;"> The hallucination problem in multimodal large language models (MLLMs) remains a common issue. Although image tokens occupy a majority of the input sequence of MLLMs, there is limited research to explore the relationship between image tokens and hallucinations. In this paper, we analyze the distribution of attention scores for image tokens across each layer and head of the model, revealing an intriguing and common phenomenon: most hallucinations are closely linked to the pattern of attention sinks in the self-attention matrix of image tokens, where shallow layers exhibit dense attention sinks and deeper layers show sparse attention sinks. We further analyze the attention heads of different layers and find that heads with high-density attention sink in the image part play a positive role in alleviating hallucinations. In this paper, we propose a training-free method named \textcolor{red}{\textbf{E}}nhancing \textcolor{red}{\textbf{A}}ttention \textcolor{red}{\textbf{H}}eads (EAH), an approach designed to enhance the convergence of image tokens attention sinks in the shallow layers. 
EAH identifies the attention head that shows the vision sink in a shallow layer and extracts its attention matrix. This attention map is then broadcast to other heads in the layer, thereby strengthening the layer to pay more attention to the image itself. With extensive experiments, EAH shows significant hallucination-mitigating performance on different MLLMs and metrics, proving its effectiveness and generality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09968v1-abstract-full').style.display = 'none'; document.getElementById('2411.09968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07600">arXiv:2411.07600</a> <span> [<a href="https://arxiv.org/pdf/2411.07600">pdf</a>, <a href="https://arxiv.org/format/2411.07600">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Decision Feedback In-Context Symbol Detection over Block-Fading Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+L">Li Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07600v1-abstract-short" style="display: inline;"> Pre-trained Transformers, through in-context learning (ICL), have demonstrated exceptional capabilities to adapt to new tasks using example prompts \textit{without model update}. Transformer-based wireless receivers, where prompts consist of the pilot data in the form of transmitted and received signal pairs, have shown high estimation accuracy when pilot data are abundant. However, pilot informat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07600v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07600v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07600v1-abstract-full" style="display: none;"> Pre-trained Transformers, through in-context learning (ICL), have demonstrated exceptional capabilities to adapt to new tasks using example prompts \textit{without model update}. Transformer-based wireless receivers, where prompts consist of the pilot data in the form of transmitted and received signal pairs, have shown high estimation accuracy when pilot data are abundant. However, pilot information is often costly and limited in practice. 
In this work, we propose the \underline{DE}cision \underline{F}eedback \underline{IN}-Cont\underline{E}xt \underline{D}etection (DEFINED) solution as a new wireless receiver design, which bypasses channel estimation and directly performs symbol detection using the (sometimes extremely) limited pilot data. The key innovation in DEFINED is the proposed decision feedback mechanism in ICL, where we sequentially incorporate the detected symbols into the prompts to improve the detections for subsequent symbols. Extensive experiments across a broad range of wireless communication settings demonstrate that DEFINED achieves significant performance improvements, in some cases only needing a single pilot pair. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07600v1-abstract-full').style.display = 'none'; document.getElementById('2411.07600v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07597">arXiv:2411.07597</a> <span> [<a href="https://arxiv.org/pdf/2411.07597">pdf</a>, <a href="https://arxiv.org/format/2411.07597">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Adversarial Machine Learning for Code Data: Realistic Threats, Countermeasures, and Interpretations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yulong Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Haoran Fan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chenhao Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhengyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xiaohong Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07597v1-abstract-short" style="display: inline;"> Code Language Models (CLMs) have achieved tremendous progress in source code understanding and generation, leading to a significant increase in research interests focused on applying CLMs to real-world software engineering tasks in recent years. However, in realistic scenarios, CLMs are exposed to potential malicious adversaries, bringing risks to the confidentiality, integrity, and availability o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07597v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07597v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07597v1-abstract-full" style="display: none;"> Code Language Models (CLMs) have achieved tremendous progress in source code understanding and generation, leading to a significant increase in research interests focused on applying CLMs to real-world software engineering tasks in recent years. 
However, in realistic scenarios, CLMs are exposed to potential malicious adversaries, bringing risks to the confidentiality, integrity, and availability of CLM systems. Despite these risks, a comprehensive analysis of the security vulnerabilities of CLMs in the extremely adversarial environment has been lacking. To close this research gap, we categorize existing attack techniques into three types based on the CIA triad: poisoning attacks (integrity \& availability infringement), evasion attacks (integrity infringement), and privacy attacks (confidentiality infringement). We have collected so far the most comprehensive (79) papers related to adversarial machine learning for CLM from the research fields of artificial intelligence, computer security, and software engineering. Our analysis covers each type of risk, examining threat model categorization, attack techniques, and countermeasures, while also introducing novel perspectives on eXplainable AI (XAI) and exploring the interconnections between different risks. Finally, we identify current challenges and future research opportunities. This study aims to provide a comprehensive roadmap for both researchers and practitioners and pave the way towards more reliable CLMs for practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07597v1-abstract-full').style.display = 'none'; document.getElementById('2411.07597v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under a reviewing process since Sep. 
3, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06221">arXiv:2411.06221</a> <span> [<a href="https://arxiv.org/pdf/2411.06221">pdf</a>, <a href="https://arxiv.org/format/2411.06221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Smart-LLaMA: Two-Stage Post-Training of Large Language Models for Smart Contract Vulnerability Detection and Explanation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lei Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhirong Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chenjie Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengjun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Li Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiajia Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06221v1-abstract-short" style="display: inline;"> With the rapid development of blockchain technology, smart contract security has become a critical challenge. Existing smart contract vulnerability detection methods face three main issues: (1) Insufficient quality of datasets, lacking detailed explanations and precise vulnerability locations. (2) Limited adaptability of large language models (LLMs) to the smart contract domain, as most LLMs are p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06221v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06221v1-abstract-full" style="display: none;"> With the rapid development of blockchain technology, smart contract security has become a critical challenge. Existing smart contract vulnerability detection methods face three main issues: (1) Insufficient quality of datasets, lacking detailed explanations and precise vulnerability locations. (2) Limited adaptability of large language models (LLMs) to the smart contract domain, as most LLMs are pre-trained on general text data but minimal smart contract-specific data. (3) Lack of high-quality explanations for detected vulnerabilities, as existing methods focus solely on detection without clear explanations. These limitations hinder detection performance and make it harder for developers to understand and fix vulnerabilities quickly, potentially leading to severe financial losses. To address these problems, we propose Smart-LLaMA, an advanced detection method based on the LLaMA language model. 
First, we construct a comprehensive dataset covering four vulnerability types with labels, detailed explanations, and precise vulnerability locations. Second, we introduce Smart Contract-Specific Continual Pre-Training, using raw smart contract data to enable the LLM to learn smart contract syntax and semantics, enhancing their domain adaptability. Furthermore, we propose Explanation-Guided Fine-Tuning, which fine-tunes the LLM using paired vulnerable code and explanations, enabling both vulnerability detection and reasoned explanations. We evaluate explanation quality through LLM and human evaluation, focusing on Correctness, Completeness, and Conciseness. Experimental results show that Smart-LLaMA outperforms state-of-the-art baselines, with average improvements of 6.49% in F1 score and 3.78% in accuracy, while providing reliable explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06221v1-abstract-full').style.display = 'none'; document.getElementById('2411.06221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01904">arXiv:2411.01904</a> <span> [<a href="https://arxiv.org/pdf/2411.01904">pdf</a>, <a href="https://arxiv.org/format/2411.01904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FPPL: An Efficient and Non-IID Robust Federated Continual Learning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuchen He</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chuyun Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01904v3-abstract-short" style="display: inline;"> Federated continual learning (FCL) aims to learn from sequential data stream in the decentralized federated learning setting, while simultaneously mitigating the catastrophic forgetting issue in classical continual learning. Existing FCL methods usually employ typical rehearsal mechanisms, which could result in privacy violations or additional onerous storage and computational burdens. In this wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01904v3-abstract-full').style.display = 'inline'; document.getElementById('2411.01904v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01904v3-abstract-full" style="display: none;"> Federated continual learning (FCL) aims to learn from sequential data stream in the decentralized federated learning setting, while simultaneously mitigating the catastrophic forgetting issue in classical continual learning. 
Existing FCL methods usually employ typical rehearsal mechanisms, which could result in privacy violations or additional onerous storage and computational burdens. In this work, an efficient and non-IID robust federated continual learning framework, called Federated Prototype-Augmented Prompt Learning (FPPL), is proposed. The FPPL can collaboratively learn lightweight prompts augmented by prototypes without rehearsal. On the client side, a fusion function is employed to fully leverage the knowledge contained in task-specific prompts for alleviating catastrophic forgetting. Additionally, global prototypes aggregated from the server are used to obtain unified representation through contrastive learning, mitigating the impact of non-IID-derived data heterogeneity. On the server side, locally uploaded prototypes are utilized to perform debiasing on the classifier, further alleviating the performance degradation caused by both non-IID and catastrophic forgetting. Empirical evaluations demonstrate the effectiveness of FPPL, achieving notable performance with an efficient design while remaining robust to diverse non-IID degrees. Code is available at: https://github.com/ycheoo/FPPL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01904v3-abstract-full').style.display = 'none'; document.getElementById('2411.01904v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00332">arXiv:2411.00332</a> <span> [<a href="https://arxiv.org/pdf/2411.00332">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Mesoscale and Nanoscale Physics">cond-mat.mes-hall</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> In-situ Self-optimization of Quantum Dot Emission for Lasers by Machine-Learning Assisted Epitaxy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+W">Wenkang Zhan</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shujie Pan</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hongyue Hao</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+N">Ning Zhuo</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+K">Kaiyao Xin</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+H">Hui Cong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chi Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+T+K">Tien Khee Ng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+C">Chunlai Xue</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fengqi Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhanguo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chao Zhao</a> </p> <p class="abstract mathjax"> 
<span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00332v1-abstract-short" style="display: inline;"> Traditional methods for optimizing light source emissions rely on a time-consuming trial-and-error approach. While in-situ optimization of light source gain media emission during growth is ideal, it has yet to be realized. In this work, we integrate in-situ reflection high-energy electron diffraction (RHEED) with machine learning (ML) to correlate the surface reconstruction with the photoluminesce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00332v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00332v1-abstract-full" style="display: none;"> Traditional methods for optimizing light source emissions rely on a time-consuming trial-and-error approach. While in-situ optimization of light source gain media emission during growth is ideal, it has yet to be realized. In this work, we integrate in-situ reflection high-energy electron diffraction (RHEED) with machine learning (ML) to correlate the surface reconstruction with the photoluminescence (PL) of InAs/GaAs quantum dots (QDs), which serve as the active region of lasers. A lightweight ResNet-GLAM model is employed for the real-time processing of RHEED data as input, enabling effective identification of optical performance. This approach guides the dynamic optimization of growth parameters, allowing real-time feedback control to adjust the QDs emission for lasers. We successfully optimized InAs QDs on GaAs substrates, with a 3.2-fold increase in PL intensity and a reduction in full width at half maximum (FWHM) from 36.69 meV to 28.17 meV under initially suboptimal growth conditions. Our automated, in-situ self-optimized lasers with 5-layer InAs QDs achieved electrically pumped continuous-wave operation at 1240 nm with a low threshold current of 150 A/cm2 at room temperature, an excellent performance comparable to samples grown through traditional manual multi-parameter optimization methods. These results mark a significant step toward intelligent, low-cost, and reproductive light emitters production. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00332v1-abstract-full').style.display = 'none'; document.getElementById('2411.00332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23166">arXiv:2410.23166</a> <span> [<a href="https://arxiv.org/pdf/2410.23166">pdf</a>, <a href="https://arxiv.org/format/2410.23166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SciPIP: An LLM-based Scientific Paper Idea Proposer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+L">Lihui Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liye Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yunxiang Luo</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yi Dai</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23166v1-abstract-short" style="display: inline;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23166v1-abstract-full" style="display: none;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea proposal has not been thoroughly explored. This paper proposes a scientific paper idea proposer (SciPIP). Based on a user-provided research background, SciPIP retrieves helpful papers from a literature database while leveraging the capabilities of LLMs to generate more novel and feasible ideas. 
To this end, 1) we construct a literature retrieval database, extracting lots of papers' multi-dimension information for fast access. Then, a literature retrieval method based on semantics, entity, and citation co-occurrences is proposed to search relevant literature from multiple aspects based on the user-provided background. 2) After literature retrieval, we introduce dual-path idea proposal strategies, where one path infers solutions from the retrieved literature and the other path generates original ideas through model brainstorming. We then combine the two to achieve a good balance between feasibility and originality. Through extensive experiments on the natural language processing (NLP) field, we demonstrate that SciPIP can retrieve citations similar to those of existing top conference papers and generate many ideas consistent with them. Additionally, we evaluate the originality of other ideas generated by SciPIP using large language models, further validating the effectiveness of our proposed method. The code and the database are released at https://github.com/cheerss/SciPIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'none'; document.getElementById('2410.23166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 5 figures, 19 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22984">arXiv:2410.22984</a> <span> [<a href="https://arxiv.org/pdf/2410.22984">pdf</a>, <a href="https://arxiv.org/format/2410.22984">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Higher-order Cross-structural Embedding Model for Time Series Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+G">Guancen Lin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+A">Aijing Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22984v1-abstract-short" style="display: inline;"> Time series analysis has gained significant attention due to its critical applications in diverse fields such as healthcare, finance, and sensor networks. The complexity and non-stationarity of time series make it challenging to capture the interaction patterns across different timestamps. 
Current approaches struggle to model higher-order interactions within time series, and focus on learning temp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22984v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22984v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22984v1-abstract-full" style="display: none;"> Time series analysis has gained significant attention due to its critical applications in diverse fields such as healthcare, finance, and sensor networks. The complexity and non-stationarity of time series make it challenging to capture the interaction patterns across different timestamps. Current approaches struggle to model higher-order interactions within time series, and focus on learning temporal or spatial dependencies separately, which limits performance in downstream tasks. To address these gaps, we propose Higher-order Cross-structural Embedding Model for Time Series (High-TS), a novel framework that jointly models both temporal and spatial perspectives by combining multiscale Transformer with Topological Deep Learning (TDL). Meanwhile, High-TS utilizes contrastive learning to integrate these two structures for generating robust and discriminative representations. Extensive experiments show that High-TS outperforms state-of-the-art methods in various time series tasks and demonstrate the importance of higher-order cross-structural information in improving model performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22984v1-abstract-full').style.display = 'none'; document.getElementById('2410.22984v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18978">arXiv:2410.18978</a> <span> [<a href="https://arxiv.org/pdf/2410.18978">pdf</a>, <a href="https://arxiv.org/format/2410.18978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Framer: Interactive Frame Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kecheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+H">Hao Ouyang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhekai Chen</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+B">Biao Gong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yujun Shen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18978v2-abstract-short" style="display: inline;"> We propose Framer for interactive frame interpolation, which targets producing smoothly transitioning frames between two images as per user creativity. Concretely, besides taking the start and end frames as inputs, our approach supports customizing the transition process by tailoring the trajectory of some selected keypoints. Such a design enjoys two clear benefits. First, incorporating human inte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18978v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18978v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18978v2-abstract-full" style="display: none;"> We propose Framer for interactive frame interpolation, which targets producing smoothly transitioning frames between two images as per user creativity. Concretely, besides taking the start and end frames as inputs, our approach supports customizing the transition process by tailoring the trajectory of some selected keypoints. Such a design enjoys two clear benefits. First, incorporating human interaction mitigates the issue arising from numerous possibilities of transforming one image to another, and in turn enables finer control of local motions. Second, as the most basic form of interaction, keypoints help establish the correspondence across frames, enhancing the model to handle challenging cases (e.g., objects on the start and end frames are of different shapes and styles). It is noteworthy that our system also offers an "autopilot" mode, where we introduce a module to estimate the keypoints and refine the trajectory automatically, to simplify the usage in practice. Extensive experimental results demonstrate the appealing performance of Framer on various applications, such as image morphing, time-lapse video generation, cartoon interpolation, etc. The code, the model, and the interface will be released to facilitate further research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18978v2-abstract-full').style.display = 'none'; document.getElementById('2410.18978v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://aim-uofa.github.io/Framer/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13981">arXiv:2410.13981</a> <span> [<a href="https://arxiv.org/pdf/2410.13981">pdf</a>, <a href="https://arxiv.org/format/2410.13981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On the Learn-to-Optimize Capabilities of Transformers in In-Context Sparse Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Renpu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Ruida Zhou</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13981v1-abstract-short" style="display: inline;"> An intriguing property of the Transformer is its ability to perform in-context learning (ICL), where the Transformer can solve different inference tasks without parameter updating based on the contextual information provided by the corresponding input-output demonstration pairs. It has been theoretically proved that ICL is enabled by the capability of Transformers to perform gradient-descent algor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13981v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13981v1-abstract-full" style="display: none;"> An intriguing property of the Transformer is its ability to perform in-context learning (ICL), where the Transformer can solve different inference tasks without parameter updating based on the contextual information provided by the corresponding input-output demonstration pairs. It has been theoretically proved that ICL is enabled by the capability of Transformers to perform gradient-descent algorithms (Von Oswald et al., 2023a; Bai et al., 2024). This work takes a step further and shows that Transformers can perform learning-to-optimize (L2O) algorithms. Specifically, for the ICL sparse recovery (formulated as LASSO) tasks, we show that a K-layer Transformer can perform an L2O algorithm with a provable convergence rate linear in K. 
This provides a new perspective explaining the superior ICL capability of Transformers, even with only a few layers, which cannot be achieved by the standard gradient-descent algorithms. Moreover, unlike the conventional L2O algorithms that require the measurement matrix involved in training to match that in testing, the trained Transformer is able to solve sparse recovery problems generated with different measurement matrices. Besides, Transformers as an L2O algorithm can leverage structural information embedded in the training tasks to accelerate its convergence during ICL, and generalize across different lengths of demonstration pairs, where conventional L2O algorithms typically struggle or fail. Such theoretical findings are supported by our experimental results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13981v1-abstract-full').style.display = 'none'; document.getElementById('2410.13981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13056">arXiv:2410.13056</a> <span> [<a href="https://arxiv.org/pdf/2410.13056">pdf</a>, <a href="https://arxiv.org/format/2410.13056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Channel-Wise Mixed-Precision Quantization for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zihan Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+B">Bike Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13056v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable success across a wide range of language tasks, but their deployment on edge devices remains challenging due to the substantial memory requirements imposed by their large parameter sizes. Weight-only quantization presents a promising solution to reduce the memory footprint of LLMs. However, existing approaches primarily focus on integer-bit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13056v3-abstract-full').style.display = 'inline'; document.getElementById('2410.13056v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13056v3-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable success across a wide range of language tasks, but their deployment on edge devices remains challenging due to the substantial memory requirements imposed by their large parameter sizes. 
Weight-only quantization presents a promising solution to reduce the memory footprint of LLMs. However, existing approaches primarily focus on integer-bit quantization, limiting their adaptability to fractional-bit quantization tasks and preventing the full utilization of available storage space on devices. In this paper, we introduce Channel-Wise Mixed-Precision Quantization (CMPQ), a novel mixed-precision quantization method that allocates quantization precision in a channel-wise pattern based on activation distributions. By assigning different precision levels to different weight channels, CMPQ can adapt to any bit-width constraint. CMPQ employs a non-uniform quantization strategy and incorporates two outlier extraction techniques that collaboratively preserve the critical information, thereby minimizing the quantization loss. Experiments on different sizes of LLMs demonstrate that CMPQ not only enhances performance in integer-bit quantization tasks but also achieves significant performance gains with a modest increase in memory usage. CMPQ thus represents an adaptive and effective approach to LLM quantization, offering substantial benefits across diverse device capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13056v3-abstract-full').style.display = 'none'; document.getElementById('2410.13056v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12934">arXiv:2410.12934</a> <span> [<a href="https://arxiv.org/pdf/2410.12934">pdf</a>, <a href="https://arxiv.org/format/2410.12934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Mathematical Reasoning in LLMs by Stepwise Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingkai Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaoxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12934v1-abstract-short" style="display: inline;"> Best-of-N decoding methods instruct large language models (LLMs) to generate multiple solutions, score each using a scoring function, and select the highest scored as the final answer to mathematical reasoning problems. However, this repeated independent process often leads to the same mistakes, making the selected solution still incorrect. 
We propose a novel prompting method named Stepwise Correc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12934v1-abstract-full" style="display: none;"> Best-of-N decoding methods instruct large language models (LLMs) to generate multiple solutions, score each using a scoring function, and select the highest scored as the final answer to mathematical reasoning problems. However, this repeated independent process often leads to the same mistakes, making the selected solution still incorrect. We propose a novel prompting method named Stepwise Correction (StepCo) that helps LLMs identify and revise incorrect steps in their generated reasoning paths. It iterates verification and revision phases that employ a process-supervised verifier. The verify-then-revise process not only improves answer correctness but also reduces token consumption with fewer paths needed to generate. With StepCo, a series of LLMs demonstrate exceptional performance. Notably, using GPT-4o as the backend LLM, StepCo achieves an average accuracy of 94.1 across eight datasets, significantly outperforming the state-of-the-art Best-of-N method by +2.4, while reducing token consumption by 77.8%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12934v1-abstract-full').style.display = 'none'; document.getElementById('2410.12934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12085">arXiv:2410.12085</a> <span> [<a href="https://arxiv.org/pdf/2410.12085">pdf</a>, <a href="https://arxiv.org/format/2410.12085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Data-adaptive Differentially Private Prompt Synthesis for In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fengyu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Ruida Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianhao Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12085v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) rely on the contextual information embedded in examples/demonstrations to perform in-context learning (ICL). To mitigate the risk of LLMs potentially leaking private information contained in examples in the prompt, we introduce a novel data-adaptive differentially private algorithm called AdaDPSyn to generate synthetic examples from the private dataset and then use the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12085v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12085v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12085v1-abstract-full" style="display: none;"> Large Language Models (LLMs) rely on the contextual information embedded in examples/demonstrations to perform in-context learning (ICL). To mitigate the risk of LLMs potentially leaking private information contained in examples in the prompt, we introduce a novel data-adaptive differentially private algorithm called AdaDPSyn to generate synthetic examples from the private dataset and then use these synthetic examples to perform ICL. The objective of AdaDPSyn is to adaptively adjust the noise level in the data synthesis mechanism according to the inherent statistical properties of the data, thereby preserving high ICL accuracy while maintaining formal differential privacy guarantees. A key innovation in AdaDPSyn is the Precision-Focused Iterative Radius Reduction technique, which dynamically refines the aggregation radius - the scope of data grouping for noise addition - based on patterns observed in data clustering, thereby minimizing the amount of additive noise. 
arXiv:2410.11778 [pdf, other] (cs.LG, cs.IT, stat.ML)
On the Training Convergence of Transformers for In-Context Classification
Authors: Wei Shen, Ruida Zhou, Jing Yang, Cong Shen
Abstract: While transformers have demonstrated impressive capacities for in-context learning (ICL) in practice, theoretical understanding of the underlying mechanism that enables transformers to perform ICL is still in its infancy. This work aims to theoretically study the training dynamics of transformers for in-context classification tasks. We demonstrate that, for in-context classification of Gaussian mixtures under certain assumptions, a single-layer transformer trained via gradient descent converges to a globally optimal model at a linear rate. We further quantify the impact of the training and testing prompt lengths on the ICL inference error of the trained transformer. We show that when the lengths of the training and testing prompts are sufficiently large, the prediction of the trained transformer approaches the Bayes-optimal classifier. Experimental results corroborate the theoretical findings.
Submitted 15 October, 2024; originally announced October 2024.
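The Bayes-optimal classifier that the trained transformer is shown to approach has a closed form for a balanced two-class Gaussian mixture with shared covariance; a small numpy illustration of that reference predictor (a generic textbook instance, not necessarily the paper's exact assumptions):

import numpy as np

def bayes_optimal_label(x, mu0, mu1, cov):
    """Posterior-odds rule for a balanced two-component Gaussian mixture."""
    prec = np.linalg.inv(cov)
    # log P(y=1|x) - log P(y=0|x) for equal priors and shared covariance
    w = prec @ (mu1 - mu0)
    b = -0.5 * (mu1 @ prec @ mu1 - mu0 @ prec @ mu0)
    return int(x @ w + b > 0)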
arXiv:2410.11719 [pdf, other] (cs.IR)
Adaptive Coordinators and Prompts on Heterogeneous Graphs for Cross-Domain Recommendations
Authors: Hengyu Zhang, Chunxu Shen, Xiangguo Sun, Jie Tan, Yu Rong, Chengzhi Piao, Hong Cheng, Lingling Yi
Abstract: In the online digital world, users frequently engage with diverse items across multiple domains (e.g., e-commerce platforms, streaming services, and social media networks), forming complex heterogeneous interaction graphs. Leveraging this multi-domain information can undoubtedly enhance the performance of recommendation systems by providing more comprehensive user insights and alleviating data sparsity in individual domains. However, integrating multi-domain knowledge for cross-domain recommendation is challenging due to inherent disparities in user behavior and item characteristics, and due to the risk of negative transfer, where irrelevant or conflicting information from the source domains adversely impacts the target domain's performance. To address these challenges, we propose HAGO, a novel framework with Heterogeneous Adaptive Graph coOrdinators, which dynamically integrates multi-domain graphs into a cohesive structure by adaptively adjusting the connections between coordinators and multi-domain graph nodes, thereby enhancing beneficial inter-domain interactions while mitigating negative transfer effects. Additionally, we develop a universal multi-domain graph pre-training strategy alongside HAGO to collaboratively learn high-quality node representations across domains. To effectively transfer the learned multi-domain knowledge to the target domain, we design an effective graph prompting method, which combines pre-trained embeddings with learnable prompts for the recommendation task. Our framework is compatible with various graph-based models and pre-training techniques, demonstrating broad applicability and effectiveness. Further experimental results show that our solutions outperform state-of-the-art methods in multi-domain recommendation scenarios and highlight their potential for real-world applications.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Under review
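The graph prompting step, combining frozen pre-trained embeddings with learnable prompts, commonly looks like the following; this is a minimal PyTorch sketch of that common pattern (the additive prompt mixture is an assumption, not HAGO's published design):

import torch
import torch.nn as nn

class PromptedNodeEmbedding(nn.Module):
    """Frozen pre-trained node embeddings plus learnable prompt vectors."""
    def __init__(self, pretrained: torch.Tensor, num_prompts: int = 4):
        super().__init__()
        self.frozen = nn.Embedding.from_pretrained(pretrained, freeze=True)
        dim = pretrained.size(1)
        self.prompts = nn.Parameter(torch.zeros(num_prompts, dim))
        self.attn = nn.Linear(dim, num_prompts)

    def forward(self, node_ids: torch.Tensor) -> torch.Tensor:
        h = self.frozen(node_ids)
        # softly select a mixture of prompts per node, add it to the frozen embedding
        weights = torch.softmax(self.attn(h), dim=-1)
        return h + weights @ self.prompts

Only the prompts and their selector train for the downstream task, which is what makes the transfer cheap relative to re-training the pre-trained backbone.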
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10815">arXiv:2410.10815</a> <span> [<a href="https://arxiv.org/pdf/2410.10815">pdf</a>, <a href="https://arxiv.org/format/2410.10815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Depth Any Video with Scalable Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Honghui Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haifeng Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10815v1-abstract-short" style="display: inline;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yieldin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10815v1-abstract-full" style="display: none;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yielding 40,000 video clips of 5-second duration, each with precise depth annotations. Second, we leverage the powerful priors of generative video diffusion models to handle real-world videos effectively, integrating advanced techniques such as rotary position encoding and flow matching to further enhance flexibility and efficiency. Unlike previous models, which are limited to fixed-length video sequences, our approach introduces a novel mixed-duration training strategy that handles videos of varying lengths and performs robustly across different frame rates-even on single frames. 
arXiv:2410.10639 [pdf, other] (cs.IR)
Generating Model Parameters for Controlling: Parameter Diffusion for Controllable Multi-Task Recommendation
Authors: Chenglei Shen, Jiahao Zhao, Xiao Zhang, Weijie Yu, Ming He, Jianping Fan
Abstract: Commercial recommender systems face the challenge that task requirements from platforms or users often change dynamically (e.g., varying preferences for accuracy or diversity). Ideally, the model should be re-trained after resetting a new objective function, adapting to these changes in task requirements. However, in practice, the high computational costs associated with retraining make this process impractical for models already deployed to online environments. This raises a new and challenging problem: how to efficiently adapt a deployed model to different task requirements by controlling its parameters, without retraining. To address this issue, we propose a novel controllable learning approach via Parameter Diffusion for controllable multi-task Recommendation (PaDiRec), which allows the customization and adaptation of recommendation model parameters to new task requirements without retraining. Specifically, we first obtain the optimized model parameters through adapter tuning based on the feasible task requirements. Then, we utilize a diffusion model as a parameter generator, employing classifier-free guidance in conditional training to learn the distribution of optimized model parameters under various task requirements. Finally, the diffusion model is applied to generate model parameters in a test-time adaptation manner, given the task requirements. As a model-agnostic approach, PaDiRec can leverage existing recommendation models as backbones to enhance their controllability. Extensive experiments on public datasets and a dataset from a commercial app indicate that PaDiRec can effectively enhance controllability through efficient model parameter generation. The code is released at https://anonymous.4open.science/r/PaDiRec-DD13.
Submitted 14 October, 2024; originally announced October 2024.
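Classifier-free guidance, mentioned in the abstract, trains one denoiser with and without the condition and mixes the two predictions at sampling time. A generic sketch of that mixing step (standard CFG; the network interface and guidance weight are illustrative, not PaDiRec's released code):

import torch

@torch.no_grad()
def cfg_noise_estimate(eps_model, x_t, t, task_cond, guidance_weight=2.0):
    """One classifier-free-guidance prediction: mix conditional and
    unconditional noise estimates before the usual diffusion update."""
    eps_cond = eps_model(x_t, t, task_cond)  # conditioned on the task requirement
    eps_uncond = eps_model(x_t, t, None)     # null condition
    return eps_uncond + guidance_weight * (eps_cond - eps_uncond)

Here `x_t` would be the noised flattened parameter vector of the adapter, so sampling the reverse process yields ready-to-load parameters for a given task requirement.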
arXiv:2410.10289 [pdf, other] (cs.CV)
Fine-grained Abnormality Prompt Learning for Zero-shot Anomaly Detection
Authors: Jiawen Zhu, Yew-Soon Ong, Chunhua Shen, Guansong Pang
Abstract: Current zero-shot anomaly detection (ZSAD) methods show remarkable success in prompting large pre-trained vision-language models to detect anomalies in a target dataset without using any dataset-specific training or demonstration. However, these methods are often focused on crafting/learning prompts that capture only coarse-grained semantics of abnormality, e.g., high-level semantics like "damaged", "imperfect", or "defective" on carpet. They therefore have limited capability in recognizing diverse abnormality details with distinctive visual appearance, e.g., specific defect types like color stains, cuts, holes, and threads on carpet. To address this limitation, we propose FAPrompt, a novel framework designed to learn Fine-grained Abnormality Prompts for more accurate ZSAD. To this end, we introduce a novel compound abnormality prompting module in FAPrompt to learn a set of complementary, decomposed abnormality prompts, where each abnormality prompt is formed by a compound of shared normal tokens and a few learnable abnormal tokens. On the other hand, the fine-grained abnormality patterns can be very different from one dataset to another. To enhance their cross-dataset generalization, we further introduce a data-dependent abnormality prior module that learns to derive abnormality features from each query/test image as a sample-wise abnormality prior to ground the abnormality prompts in a given target dataset. Comprehensive experiments conducted across 19 real-world datasets, covering both industrial defects and medical anomalies, demonstrate that FAPrompt substantially outperforms state-of-the-art methods by at least 3%-5% AUC/AP in both image- and pixel-level ZSAD tasks. Code is available at https://github.com/mala-lab/FAPrompt.
Submitted 14 October, 2024; originally announced October 2024.
Comments: 27 pages, 19 figures
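The compound abnormality prompt described above, shared normal tokens plus a few learnable abnormal tokens per prompt, can be pictured as follows; the token counts, embedding width, and initialization are assumptions for illustration, not the published FAPrompt design:

import torch
import torch.nn as nn

class CompoundAbnormalityPrompts(nn.Module):
    """K abnormality prompts sharing normal tokens, each adding its own
    small set of learnable abnormal tokens."""
    def __init__(self, dim=512, num_prompts=10, shared_len=8, abnormal_len=2):
        super().__init__()
        self.shared = nn.Parameter(torch.randn(shared_len, dim) * 0.02)
        self.abnormal = nn.Parameter(torch.randn(num_prompts, abnormal_len, dim) * 0.02)

    def forward(self) -> torch.Tensor:
        k = self.abnormal.size(0)
        shared = self.shared.unsqueeze(0).expand(k, -1, -1)
        # each prompt = [shared normal tokens | its own abnormal tokens]
        return torch.cat([shared, self.abnormal], dim=1)  # (K, shared+abnormal, dim)

Feeding each row through a CLIP-style text encoder would yield K complementary abnormality embeddings that differ only in their learned abnormal tokens.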
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 19 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09701">arXiv:2410.09701</a> <span> [<a href="https://arxiv.org/pdf/2410.09701">pdf</a>, <a href="https://arxiv.org/format/2410.09701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Transformers as Game Players: Provable In-context Game-playing Capabilities of Pre-trained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chengshuai Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kun Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09701v1-abstract-short" style="display: inline;"> The in-context learning (ICL) capability of pre-trained models based on the transformer architecture has received growing interest in recent years. While theoretical understanding has been obtained for ICL in reinforcement learning (RL), the previous results are largely confined to the single-agent setting. This work proposes to further explore the in-context learning capabilities of pre-trained t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09701v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09701v1-abstract-full" style="display: none;"> The in-context learning (ICL) capability of pre-trained models based on the transformer architecture has received growing interest in recent years. While theoretical understanding has been obtained for ICL in reinforcement learning (RL), the previous results are largely confined to the single-agent setting. This work proposes to further explore the in-context learning capabilities of pre-trained transformer models in competitive multi-agent games, i.e., in-context game-playing (ICGP). Focusing on the classical two-player zero-sum games, theoretical guarantees are provided to demonstrate that pre-trained transformers can provably learn to approximate Nash equilibrium in an in-context manner for both decentralized and centralized learning settings. As a key part of the proof, constructional results are established to demonstrate that the transformer architecture is sufficiently rich to realize celebrated multi-agent game-playing algorithms, in particular, decentralized V-learning and centralized VI-ULCB. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09701v1-abstract-full').style.display = 'none'; document.getElementById('2410.09701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09543">arXiv:2410.09543</a> <span> [<a href="https://arxiv.org/pdf/2410.09543">pdf</a>, <a href="https://arxiv.org/format/2410.09543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Boltzmann-Aligned Inverse Folding Model as a Predictor of Mutational Effects on Protein-Protein Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+X">Xiaoran Jiao</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+W">Weian Mao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+W">Wengong Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+P">Peiyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09543v1-abstract-short" style="display: inline;"> Predicting the change in binding free energy ($螖螖G$) is crucial for understanding and modulating protein-protein interactions, which are critical in drug design. Due to the scarcity of experimental $螖螖G$ data, existing methods focus on pre-training, while neglecting the importance of alignment. In this work, we propose the Boltzmann Alignment technique to transfer knowledge from pre-trained invers… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09543v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09543v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09543v1-abstract-full" style="display: none;"> Predicting the change in binding free energy ($螖螖G$) is crucial for understanding and modulating protein-protein interactions, which are critical in drug design. Due to the scarcity of experimental $螖螖G$ data, existing methods focus on pre-training, while neglecting the importance of alignment. In this work, we propose the Boltzmann Alignment technique to transfer knowledge from pre-trained inverse folding models to $螖螖G$ prediction. We begin by analyzing the thermodynamic definition of $螖螖G$ and introducing the Boltzmann distribution to connect energy with protein conformational distribution. 
arXiv:2410.04842 [pdf, other] (cs.CV)
A Simple Image Segmentation Framework via In-Context Examples
Authors: Yang Liu, Chenchen Jing, Hengtao Li, Muzhi Zhu, Hao Chen, Xinlong Wang, Chunhua Shen
Abstract: Recently, there have been explorations of generalist segmentation models that can effectively tackle a variety of image segmentation tasks within a unified in-context learning framework. However, these methods still struggle with task ambiguity in in-context segmentation, as not all in-context examples can accurately convey the task information. To address this issue, we present SINE, a simple image Segmentation framework utilizing IN-context Examples. Our approach leverages a Transformer encoder-decoder structure, where the encoder provides high-quality image representations and the decoder is designed to yield multiple task-specific output masks to effectively eliminate task ambiguity. Specifically, we introduce an In-context Interaction module to complement in-context information and produce correlations between the target image and the in-context example, and a Matching Transformer that uses fixed matching and the Hungarian algorithm to eliminate differences between different tasks. In addition, we further refine the current evaluation system for in-context image segmentation, aiming to facilitate a holistic appraisal of these models. Experiments on various segmentation tasks show the effectiveness of the proposed method.
Submitted 8 October, 2024; v1 submitted 7 October, 2024; originally announced October 2024.
Comments: Accepted to Proc. Conference on Neural Information Processing Systems (NeurIPS) 2024. Webpage: https://github.com/aim-uofa/SINE
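Hungarian matching of predicted masks to reference masks, as in the Matching Transformer above, is typically run on a pairwise cost matrix; a generic sketch with scipy (the negative-IoU cost is a common choice, not necessarily SINE's exact cost):

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_masks(pred_masks, ref_masks):
    """One-to-one assignment of predicted to reference boolean masks by maximal IoU."""
    cost = np.zeros((len(pred_masks), len(ref_masks)))
    for i, p in enumerate(pred_masks):
        for j, r in enumerate(ref_masks):
            inter = np.logical_and(p, r).sum()
            union = np.logical_or(p, r).sum()
            cost[i, j] = -(inter / union if union else 0.0)  # negate: solver minimizes
    rows, cols = linear_sum_assignment(cost)
    return list(zip(rows.tolist(), cols.tolist()))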
arXiv:2410.04765 [pdf, other] (cond-mat.mtrl-sci, cs.AI, cs.LG)
Molecular topological deep learning for polymer property prediction
Authors: Cong Shen, Yipeng Zhang, Fei Han, Kelin Xia
Abstract: Accurate and efficient prediction of polymer properties is of key importance for polymer design. Traditional experimental tools and density functional theory (DFT)-based simulations for polymer property evaluation are both expensive and time-consuming. Recently, a vast number of graph-based molecular models have emerged and demonstrated huge potential in molecular data analysis. Despite this great progress, these models tend to ignore the high-order and multiscale information within the data. In this paper, we develop molecular topological deep learning (Mol-TDL) for polymer property analysis. Our Mol-TDL incorporates both high-order interactions and multiscale properties into a topological deep learning architecture. The key idea is to represent polymer molecules as a series of simplicial complexes at different scales and build simplicial neural networks accordingly. The aggregated information from different scales provides a more accurate prediction of polymer molecular properties.
Submitted 7 October, 2024; originally announced October 2024.
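A multiscale readout of the kind this abstract describes is often just an aggregation of per-scale encodings; a toy PyTorch sketch under that reading (the per-scale encoders, mean aggregation, and linear head are assumptions, not Mol-TDL's published architecture):

import torch
import torch.nn as nn

class MultiScaleReadout(nn.Module):
    """Aggregate per-scale representations of a molecule into one property prediction."""
    def __init__(self, scale_encoders: nn.ModuleList, dim: int):
        super().__init__()
        self.scale_encoders = scale_encoders  # one network per filtration scale
        self.head = nn.Linear(dim, 1)

    def forward(self, complexes_per_scale):
        # encode the simplicial complex built at each scale, then average
        feats = [enc(c) for enc, c in zip(self.scale_encoders, complexes_per_scale)]
        return self.head(torch.stack(feats, dim=0).mean(dim=0))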
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04765v1-abstract-full').style.display = 'none'; document.getElementById('2410.04765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03724">arXiv:2410.03724</a> <span> [<a href="https://arxiv.org/pdf/2410.03724">pdf</a>, <a href="https://arxiv.org/format/2410.03724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models Overcome the Machine Penalty When Acting Fairly but Not When Acting Selfishly or Altruistically </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+R">Ruiqi Song</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shiya Yin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Battu%2C+B">Balaraju Battu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lei Shi</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+D">Danyang Jia</a>, <a href="/search/cs?searchtype=author&query=Rahwan%2C+T">Talal Rahwan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03724v2-abstract-short" style="display: inline;"> In social dilemmas where the collective and self-interests are at odds, people typically cooperate less with machines than with fellow humans, a phenomenon termed the machine penalty. Overcoming this penalty is critical for successful human-machine collectives, yet current solutions often involve ethically-questionable tactics, like concealing machines' non-human nature. In this study, with 1,152… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03724v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03724v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03724v2-abstract-full" style="display: none;"> In social dilemmas where the collective and self-interests are at odds, people typically cooperate less with machines than with fellow humans, a phenomenon termed the machine penalty. Overcoming this penalty is critical for successful human-machine collectives, yet current solutions often involve ethically-questionable tactics, like concealing machines' non-human nature. 