Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 376 results for author: <span class="mathjax">Xie, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xie%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Xie, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xie%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xie, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xie%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08540">arXiv:2502.08540</a> <span> [<a href="https://arxiv.org/pdf/2502.08540">pdf</a>, <a href="https://arxiv.org/format/2502.08540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Image Quality Assessment: Insights, Analysis, and Future Outlook </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chengqian Ma</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhengyi Shi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiqiang Lu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shenghao Xie</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+F">Fei Chao</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Y">Yao Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08540v1-abstract-short" style="display: inline;"> Image quality assessment (IQA) represents a pivotal challenge in image-focused technologies, significantly influencing the advancement trajectory of image processing and computer vision. Recently, IQA has witnessed a notable surge in innovative research efforts, driven by the emergence of novel architectural paradigms and sophisticated computational techniques. This survey delivers an extensive an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08540v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08540v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08540v1-abstract-full" style="display: none;"> Image quality assessment (IQA) represents a pivotal challenge in image-focused technologies, significantly influencing the advancement trajectory of image processing and computer vision. Recently, IQA has witnessed a notable surge in innovative research efforts, driven by the emergence of novel architectural paradigms and sophisticated computational techniques. This survey delivers an extensive analysis of contemporary IQA methodologies, organized according to their application scenarios, serving as a beneficial reference for both beginners and experienced researchers. We analyze the advantages and limitations of current approaches and suggest potential future research pathways. The survey encompasses both general and specific IQA methodologies, including conventional statistical measures, machine learning techniques, and cutting-edge deep learning models such as convolutional neural networks (CNNs) and Transformer models. The analysis within this survey highlights the necessity for distortion-specific IQA methods tailored to various application scenarios, emphasizing the significance of practicality, interpretability, and ease of implementation in future developments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08540v1-abstract-full').style.display = 'none'; document.getElementById('2502.08540v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07527">arXiv:2502.07527</a> <span> [<a href="https://arxiv.org/pdf/2502.07527">pdf</a>, <a href="https://arxiv.org/format/2502.07527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NatureLM: Deciphering the Language of Nature for Scientific Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yingce Xia</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peiran Jin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shufang Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chuan Cao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqian Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guoqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zequn Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuan-Jyue Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zekun Guo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yeqi Bai</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+P">Pan Deng</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yaosen Min</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Ziheng Lu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hongxia Hao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Han Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jielan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jia Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kehan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+K">Kaiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+Q">Qizhi Pei</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07527v1-abstract-short" style="display: inline;"> Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07527v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07527v1-abstract-full" style="display: none;"> Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typically trained in isolation, lacking the ability to integrate across different scientific domains. Recognizing that entities within these domains can all be represented as sequences, which together form the "language of nature", we introduce Nature Language Model (briefly, NatureLM), a sequence-based science foundation model designed for scientific discovery. Pre-trained with data from multiple scientific domains, NatureLM offers a unified, versatile model that enables various applications including: (i) generating and optimizing small molecules, proteins, RNA, and materials using text instructions; (ii) cross-domain generation/design, such as protein-to-molecule and protein-to-RNA generation; and (iii) achieving state-of-the-art performance in tasks like SMILES-to-IUPAC translation and retrosynthesis on USPTO-50k. NatureLM offers a promising generalist approach for various scientific tasks, including drug discovery (hit generation/optimization, ADMET optimization, synthesis), novel material design, and the development of therapeutic proteins or nucleotides. We have developed NatureLM models in different sizes (1 billion, 8 billion, and 46.7 billion parameters) and observed a clear improvement in performance as the model size increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07527v1-abstract-full').style.display = 'none'; document.getElementById('2502.07527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">81 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04482">arXiv:2502.04482</a> <span> [<a href="https://arxiv.org/pdf/2502.04482">pdf</a>, <a href="https://arxiv.org/format/2502.04482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714398">10.1145/3706598.3714398 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Gig2Gether: Data-sharing to Empower, Unify and Demystify Gig Work </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hsieh%2C+J">Jane Hsieh</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Angie Zhang</a>, <a href="/search/cs?searchtype=author&query=Surati%2C+S">Sajel Surati</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sijia Xie</a>, <a href="/search/cs?searchtype=author&query=Ayala%2C+Y">Yeshua Ayala</a>, <a href="/search/cs?searchtype=author&query=Sathiya%2C+N">Nithila Sathiya</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+T">Tzu-Sheng Kuo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M+K">Min Kyung Lee</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haiyi Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04482v1-abstract-short" style="display: inline;"> The wide adoption of platformized work has generated remarkable advancements in the labor patterns and mobility of modern society. Underpinning such progress, gig workers are exposed to unprecedented challenges and accountabilities: lack of data transparency, social and physical isolation, as well as insufficient infrastructural safeguards. Gig2Gether presents a space designed for workers to engag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04482v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04482v1-abstract-full" style="display: none;"> The wide adoption of platformized work has generated remarkable advancements in the labor patterns and mobility of modern society. Underpinning such progress, gig workers are exposed to unprecedented challenges and accountabilities: lack of data transparency, social and physical isolation, as well as insufficient infrastructural safeguards. Gig2Gether presents a space designed for workers to engage in an initial experience of voluntarily contributing anecdotal and statistical data to affect policy and build solidarity across platforms by exchanging unifying and diverse experiences. Our 7-day field study with 16 active workers from three distinct platforms and work domains showed existing affordances of data-sharing: facilitating mutual support across platforms, as well as enabling financial reflection and planning. Additionally, workers envisioned future use cases of data-sharing for collectivism (e.g., collaborative examinations of algorithmic speculations) and informing policy (e.g., around safety and pay), which motivated (latent) worker desiderata of additional capabilities and data metrics. Based on these findings, we discuss remaining challenges to address and how data-sharing tools can complement existing structures to maximize worker empowerment and policy impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04482v1-abstract-full').style.display = 'none'; document.getElementById('2502.04482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02690">arXiv:2502.02690</a> <span> [<a href="https://arxiv.org/pdf/2502.02690">pdf</a>, <a href="https://arxiv.org/format/2502.02690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Controllable Video Generation with Provable Disentanglement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yifan Shen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Peiyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zijian Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaoan Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zeyu Tang</a>, <a href="/search/cs?searchtype=author&query=Deka%2C+N">Namrata Deka</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zongfang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02690v1-abstract-short" style="display: inline;"> Controllable video generation remains a significant challenge, despite recent advances in generating high-quality and consistent videos. Most existing methods for controlling video generation treat the video as a whole, neglecting intricate fine-grained spatiotemporal relationships, which limits both control precision and efficiency. In this paper, we propose Controllable Video Generative Adversar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02690v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02690v1-abstract-full" style="display: none;"> Controllable video generation remains a significant challenge, despite recent advances in generating high-quality and consistent videos. Most existing methods for controlling video generation treat the video as a whole, neglecting intricate fine-grained spatiotemporal relationships, which limits both control precision and efficiency. In this paper, we propose Controllable Video Generative Adversarial Networks (CoVoGAN) to disentangle the video concepts, thus facilitating efficient and independent control over individual concepts. Specifically, following the minimal change principle, we first disentangle static and dynamic latent variables. We then leverage the sufficient change property to achieve component-wise identifiability of dynamic latent variables, enabling independent control over motion and identity. To establish the theoretical foundation, we provide a rigorous analysis demonstrating the identifiability of our approach. Building on these theoretical insights, we design a Temporal Transition Module to disentangle latent dynamics. To enforce the minimal change principle and sufficient change property, we minimize the dimensionality of latent dynamic variables and impose temporal conditional independence. To validate our approach, we integrate this module as a plug-in for GANs. Extensive qualitative and quantitative experiments on various video generation benchmarks demonstrate that our method significantly improves generation quality and controllability across diverse real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02690v1-abstract-full').style.display = 'none'; document.getElementById('2502.02690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01567">arXiv:2502.01567</a> <span> [<a href="https://arxiv.org/pdf/2502.01567">pdf</a>, <a href="https://arxiv.org/format/2502.01567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Scalable Language Models with Posterior Inference of Latent Thought Vectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+D">Deqian Kong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Minglu Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dehong Xu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+B">Bo Pang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shu Wang</a>, <a href="/search/cs?searchtype=author&query=Honig%2C+E">Edouardo Honig</a>, <a href="/search/cs?searchtype=author&query=Si%2C+Z">Zhangzhang Si</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuan Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jianwen Xie</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sirui Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y+N">Ying Nian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01567v1-abstract-short" style="display: inline;"> We propose a novel family of language models, Latent-Thought Language Models (LTMs), which incorporate explicit latent thought vectors that follow an explicit prior model in latent space. These latent thought vectors guide the autoregressive generation of ground tokens through a Transformer decoder. Training employs a dual-rate optimization process within the classical variational Bayes framework:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01567v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01567v1-abstract-full" style="display: none;"> We propose a novel family of language models, Latent-Thought Language Models (LTMs), which incorporate explicit latent thought vectors that follow an explicit prior model in latent space. These latent thought vectors guide the autoregressive generation of ground tokens through a Transformer decoder. Training employs a dual-rate optimization process within the classical variational Bayes framework: fast learning of local variational parameters for the posterior distribution of latent vectors, and slow learning of global decoder parameters. Empirical studies reveal that LTMs possess additional scaling dimensions beyond traditional LLMs, yielding a structured design space. Higher sample efficiency can be achieved by increasing training compute per token, with further gains possible by trading model size for more inference steps. Designed based on these scaling properties, LTMs demonstrate superior sample and parameter efficiency compared to conventional autoregressive models and discrete diffusion models. They significantly outperform these counterparts in validation perplexity and zero-shot language modeling. Additionally, LTMs exhibit emergent few-shot in-context reasoning capabilities that scale with model and latent size, and achieve competitive performance in conditional and unconditional text generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01567v1-abstract-full').style.display = 'none'; document.getElementById('2502.01567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17161">arXiv:2501.17161</a> <span> [<a href="https://arxiv.org/pdf/2501.17161">pdf</a>, <a href="https://arxiv.org/format/2501.17161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+T">Tianzhe Chu</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yuexiang Zhai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jihan Yang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+S">Shengbang Tong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Schuurmans%2C+D">Dale Schuurmans</a>, <a href="/search/cs?searchtype=author&query=Le%2C+Q+V">Quoc V. Le</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yi Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17161v1-abstract-short" style="display: inline;"> Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17161v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17161v1-abstract-full" style="display: none;"> Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17161v1-abstract-full').style.display = 'none'; document.getElementById('2501.17161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website at https://tianzhechu.com/SFTvsRL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16164">arXiv:2501.16164</a> <span> [<a href="https://arxiv.org/pdf/2501.16164">pdf</a>, <a href="https://arxiv.org/format/2501.16164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MetaDecorator: Generating Immersive Virtual Tours through Multimodality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuang Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J+S+A">Jeannie S. A. Lee</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Haiwei Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16164v1-abstract-short" style="display: inline;"> MetaDecorator, is a framework that empowers users to personalize virtual spaces. By leveraging text-driven prompts and image synthesis techniques, MetaDecorator adorns static panoramas captured by 360掳 imaging devices, transforming them into uniquely styled and visually appealing environments. This significantly enhances the realism and engagement of virtual tours compared to traditional offerings… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16164v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16164v1-abstract-full" style="display: none;"> MetaDecorator, is a framework that empowers users to personalize virtual spaces. By leveraging text-driven prompts and image synthesis techniques, MetaDecorator adorns static panoramas captured by 360掳 imaging devices, transforming them into uniquely styled and visually appealing environments. This significantly enhances the realism and engagement of virtual tours compared to traditional offerings. Beyond the core framework, we also discuss the integration of Large Language Models (LLMs) and haptics in the VR application to provide a more immersive experience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16164v1-abstract-full').style.display = 'none'; document.getElementById('2501.16164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14732">arXiv:2501.14732</a> <span> [<a href="https://arxiv.org/pdf/2501.14732">pdf</a>, <a href="https://arxiv.org/format/2501.14732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Orthrus: Accelerating Multi-BFT Consensus through Concurrent Partial Ordering of Transactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+H">Hanzheng Lyu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaokang Xie</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+J">Jianyu Niu</a>, <a href="/search/cs?searchtype=author&query=Beschastnikh%2C+I">Ivan Beschastnikh</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Sadoghi%2C+M">Mohammad Sadoghi</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chen Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14732v1-abstract-short" style="display: inline;"> Multi-Byzantine Fault Tolerant (Multi-BFT) consensus allows multiple consensus instances to run in parallel, resolving the leader bottleneck problem inherent in classic BFT consensus. However, the global ordering of Multi-BFT consensus enforces a strict serialized sequence of transactions, imposing additional confirmation latency and also limiting concurrency. In this paper, we introduce Orthrus,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14732v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14732v1-abstract-full" style="display: none;"> Multi-Byzantine Fault Tolerant (Multi-BFT) consensus allows multiple consensus instances to run in parallel, resolving the leader bottleneck problem inherent in classic BFT consensus. However, the global ordering of Multi-BFT consensus enforces a strict serialized sequence of transactions, imposing additional confirmation latency and also limiting concurrency. In this paper, we introduce Orthrus, a Multi-BFT protocol that accelerates transaction confirmation through partial ordering while reserving global ordering for transactions requiring stricter sequencing. To this end, Orthrus strategically partitions transactions to maximize concurrency and ensure consistency. Additionally, it incorporates an escrow mechanism to manage interactions between partially and globally ordered transactions. We evaluated Orthrus through extensive experiments in realistic settings, deploying 128 replicas in WAN and LAN environments. Our findings demonstrate latency reductions of up to 87% in WAN compared to existing Multi-BFT protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14732v1-abstract-full').style.display = 'none'; document.getElementById('2501.14732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13430">arXiv:2501.13430</a> <span> [<a href="https://arxiv.org/pdf/2501.13430">pdf</a>, <a href="https://arxiv.org/format/2501.13430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Wasserstein-regularized Conformal Prediction under General Distribution Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rui Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chao Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yue Sun</a>, <a href="/search/cs?searchtype=author&query=Venkitasubramaniam%2C+P">Parvathinathan Venkitasubramaniam</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sihong Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13430v1-abstract-short" style="display: inline;"> Conformal prediction yields a prediction set with guaranteed $1-伪$ coverage of the true target under the i.i.d. assumption, which may not hold and lead to a gap between $1-伪$ and the actual coverage. Prior studies bound the gap using total variation distance, which cannot identify the gap changes under distribution shift at a given $伪$. Besides, existing methods are mostly limited to covariate shi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13430v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13430v1-abstract-full" style="display: none;"> Conformal prediction yields a prediction set with guaranteed $1-伪$ coverage of the true target under the i.i.d. assumption, which may not hold and lead to a gap between $1-伪$ and the actual coverage. Prior studies bound the gap using total variation distance, which cannot identify the gap changes under distribution shift at a given $伪$. Besides, existing methods are mostly limited to covariate shift,while general joint distribution shifts are more common in practice but less researched.In response, we first propose a Wasserstein distance-based upper bound of the coverage gap and analyze the bound using probability measure pushforwards between the shifted joint data and conformal score distributions, enabling a separation of the effect of covariate and concept shifts over the coverage gap. We exploit the separation to design an algorithm based on importance weighting and regularized representation learning (WR-CP) to reduce the Wasserstein bound with a finite-sample error bound.WR-CP achieves a controllable balance between conformal prediction accuracy and efficiency. Experiments on six datasets prove that WR-CP can reduce coverage gaps to $3.1\%$ across different confidence levels and outputs prediction sets 38$\%$ smaller than the worst-case approach on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13430v1-abstract-full').style.display = 'none'; document.getElementById('2501.13430v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10784">arXiv:2501.10784</a> <span> [<a href="https://arxiv.org/pdf/2501.10784">pdf</a>, <a href="https://arxiv.org/format/2501.10784">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Measuring Fairness in Financial Transaction Machine Learning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ayvaz%2C+D+S">Deniz Sezin Ayvaz</a>, <a href="/search/cs?searchtype=author&query=Belenguer%2C+L">Lorenzo Belenguer</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hankun He</a>, <a href="/search/cs?searchtype=author&query=Kanubala%2C+D+D">Deborah Dormah Kanubala</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingxu Li</a>, <a href="/search/cs?searchtype=author&query=Low%2C+S">Soung Low</a>, <a href="/search/cs?searchtype=author&query=Mougan%2C+C">Carlos Mougan</a>, <a href="/search/cs?searchtype=author&query=Onwuegbuche%2C+F+C">Faithful Chiagoziem Onwuegbuche</a>, <a href="/search/cs?searchtype=author&query=Pi%2C+Y">Yulu Pi</a>, <a href="/search/cs?searchtype=author&query=Sikora%2C+N">Natalia Sikora</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+D">Dan Tran</a>, <a href="/search/cs?searchtype=author&query=Verma%2C+S">Shresth Verma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Skyler Xie</a>, <a href="/search/cs?searchtype=author&query=Pelletier%2C+A">Adeline Pelletier</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10784v2-abstract-short" style="display: inline;"> Mastercard, a global leader in financial services, develops and deploys machine learning models aimed at optimizing card usage and preventing attrition through advanced predictive models. These models use aggregated and anonymized card usage patterns, including cross-border transactions and industry-specific spending, to tailor bank offerings and maximize revenue opportunities. Mastercard has esta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10784v2-abstract-full').style.display = 'inline'; document.getElementById('2501.10784v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10784v2-abstract-full" style="display: none;"> Mastercard, a global leader in financial services, develops and deploys machine learning models aimed at optimizing card usage and preventing attrition through advanced predictive models. These models use aggregated and anonymized card usage patterns, including cross-border transactions and industry-specific spending, to tailor bank offerings and maximize revenue opportunities. Mastercard has established an AI Governance program, based on its Data and Tech Responsibility Principles, to evaluate any built and bought AI for efficacy, fairness, and transparency. As part of this effort, Mastercard has sought expertise from the Turing Institute through a Data Study Group to better assess fairness in more complex AI/ML models. The Data Study Group challenge lies in defining, measuring, and mitigating fairness in these predictions, which can be complex due to the various interpretations of fairness, gaps in the research literature, and ML-operations challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10784v2-abstract-full').style.display = 'none'; document.getElementById('2501.10784v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Mastercard Data Study Group Alan Turing Institute: https://www.turing.ac.uk/news/publications/data-study-group-final-report-mastercard</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09732">arXiv:2501.09732</a> <span> [<a href="https://arxiv.org/pdf/2501.09732">pdf</a>, <a href="https://arxiv.org/format/2501.09732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+N">Nanye Ma</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+S">Shangyuan Tong</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+H">Haolin Jia</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu-Chuan Su</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingda Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuan Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yandong Li</a>, <a href="/search/cs?searchtype=author&query=Jaakkola%2C+T">Tommi Jaakkola</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09732v1-abstract-short" style="display: inline;"> Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09732v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09732v1-abstract-full" style="display: none;"> Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09732v1-abstract-full').style.display = 'none'; document.getElementById('2501.09732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04268">arXiv:2501.04268</a> <span> [<a href="https://arxiv.org/pdf/2501.04268">pdf</a>, <a href="https://arxiv.org/format/2501.04268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robotic Programmer: Video Instructed Policy Code Generation for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Senwei Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongyu Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhanqi Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruiping Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xilin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04268v1-abstract-short" style="display: inline;"> Zero-shot generalization across various robots, tasks and environments remains a significant challenge in robotic manipulation. Policy code generation methods use executable code to connect high-level task descriptions and low-level action sequences, leveraging the generalization capabilities of large language models and atomic skill libraries. In this work, we propose Robotic Programmer (RoboPro)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04268v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04268v1-abstract-full" style="display: none;"> Zero-shot generalization across various robots, tasks and environments remains a significant challenge in robotic manipulation. Policy code generation methods use executable code to connect high-level task descriptions and low-level action sequences, leveraging the generalization capabilities of large language models and atomic skill libraries. In this work, we propose Robotic Programmer (RoboPro), a robotic foundation model, enabling the capability of perceiving visual information and following free-form instructions to perform robotic manipulation with policy code in a zero-shot manner. To address low efficiency and high cost in collecting runtime code data for robotic tasks, we devise Video2Code to synthesize executable code from extensive videos in-the-wild with off-the-shelf vision-language model and code-domain large language model. Extensive experiments show that RoboPro achieves the state-of-the-art zero-shot performance on robotic manipulation in both simulators and real-world environments. Specifically, the zero-shot success rate of RoboPro on RLBench surpasses the state-of-the-art model GPT-4o by 11.6%, which is even comparable to a strong supervised training baseline. Furthermore, RoboPro is robust to variations on API formats and skill sets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04268v1-abstract-full').style.display = 'none'; document.getElementById('2501.04268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04003">arXiv:2501.04003</a> <span> [<a href="https://arxiv.org/pdf/2501.04003">pdf</a>, <a href="https://arxiv.org/format/2501.04003">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaoyuan Xie</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingdong Kong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&query=Sima%2C+C">Chonghao Sima</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q+A">Qi Alfred Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Liang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04003v1-abstract-short" style="display: inline;"> Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. To address this gap, we introduce DriveBench, a be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04003v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04003v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04003v1-abstract-full" style="display: none;"> Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. To address this gap, we introduce DriveBench, a benchmark dataset designed to evaluate VLM reliability across 17 settings (clean, corrupted, and text-only inputs), encompassing 19,200 frames, 20,498 question-answer pairs, three question types, four mainstream driving tasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often generate plausible responses derived from general knowledge or textual cues rather than true visual grounding, especially under degraded or missing visual inputs. This behavior, concealed by dataset imbalances and insufficient evaluation metrics, poses significant risks in safety-critical scenarios like autonomous driving. We further observe that VLMs struggle with multi-modal reasoning and display heightened sensitivity to input corruptions, leading to inconsistencies in performance. To address these challenges, we propose refined evaluation metrics that prioritize robust visual grounding and multi-modal understanding. Additionally, we highlight the potential of leveraging VLMs' awareness of corruptions to enhance their reliability, offering a roadmap for developing more trustworthy and interpretable decision-making systems in real-world autonomous driving contexts. The benchmark toolkit is publicly accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04003v1-abstract-full').style.display = 'none'; document.getElementById('2501.04003v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint; 41 pages, 32 figures, 16 tables; Project Page at https://drive-bench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01062">arXiv:2501.01062</a> <span> [<a href="https://arxiv.org/pdf/2501.01062">pdf</a>, <a href="https://arxiv.org/format/2501.01062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Fides: Scalable Censorship-Resistant DAG Consensus via Trusted Components </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaokang Xie</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+D">Dakai Kang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+H">Hanzheng Lyu</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+J">Jianyu Niu</a>, <a href="/search/cs?searchtype=author&query=Sadoghi%2C+M">Mohammad Sadoghi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01062v1-abstract-short" style="display: inline;"> Recently, consensus protocols based on Directed Acyclic Graph (DAG) have gained significant attention due to their potential to build robust blockchain systems, particularly in asynchronous networks. In this paper, we propose Fides, an asynchronous DAG-based BFT consensus protocol that leverages Trusted Execution Environments (TEEs) to tackle three major scalability and security challenges faced b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01062v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01062v1-abstract-full" style="display: none;"> Recently, consensus protocols based on Directed Acyclic Graph (DAG) have gained significant attention due to their potential to build robust blockchain systems, particularly in asynchronous networks. In this paper, we propose Fides, an asynchronous DAG-based BFT consensus protocol that leverages Trusted Execution Environments (TEEs) to tackle three major scalability and security challenges faced by existing protocols: (i) the need for a larger quorum size (i.e., at least 3x larger) to tolerate Byzantine replicas, (ii) high communication costs and reliance on expensive cryptographic primitives (i.e., global common coin) to reach agreement in asynchronous networks, and (iii) poor censorship resilience undermining the liveness guarantee. Specifically, Fides adopts four trusted components-Reliable Broadcast, Vertex Validation, Common Coin, and Transaction Disclosure-within TEEs. Incorporating these components enables Fides to achieve linear message complexity, guaranteed censorship resilience, 2x larger quorum size, and lightweight common coin usage. Besides, abstracting these essential components rather than porting the entire protocol into TEE can significantly reduce the Trusted Computing Base (TCB). Experimental evaluations of Fides in local and geo-distributed networks demonstrate its superior performance compared to established state-of-the-art protocols such as Tusk, RCC, HotStuff, and PBFT. The results indicate that Fides achieves a throughput of 400k transactions per second in a geo-distributed network and 810k transactions per second in a local network. Our analysis further explores the protocol's overhead, highlighting its suitability and effectiveness for practical deployment in real-world blockchain systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01062v1-abstract-full').style.display = 'none'; document.getElementById('2501.01062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00379">arXiv:2501.00379</a> <span> [<a href="https://arxiv.org/pdf/2501.00379">pdf</a>, <a href="https://arxiv.org/format/2501.00379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Federated Dropout: Convergence Analysis and Resource Allocation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sijing Xie</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+D">Dingzhu Wen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaonan Liu</a>, <a href="/search/cs?searchtype=author&query=You%2C+C">Changsheng You</a>, <a href="/search/cs?searchtype=author&query=Ratnarajah%2C+T">Tharmalingam Ratnarajah</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaibin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00379v1-abstract-short" style="display: inline;"> Federated Dropout is an efficient technique to overcome both communication and computation bottlenecks for deploying federated learning at the network edge. In each training round, an edge device only needs to update and transmit a sub-model, which is generated by the typical method of dropout in deep learning, and thus effectively reduces the per-round latency. \textcolor{blue}{However, the theor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00379v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00379v1-abstract-full" style="display: none;"> Federated Dropout is an efficient technique to overcome both communication and computation bottlenecks for deploying federated learning at the network edge. In each training round, an edge device only needs to update and transmit a sub-model, which is generated by the typical method of dropout in deep learning, and thus effectively reduces the per-round latency. \textcolor{blue}{However, the theoretical convergence analysis for Federated Dropout is still lacking in the literature, particularly regarding the quantitative influence of dropout rate on convergence}. To address this issue, by using the Taylor expansion method, we mathematically show that the gradient variance increases with a scaling factor of $纬/(1-纬)$, with $纬\in [0, 胃)$ denoting the dropout rate and $胃$ being the maximum dropout rate ensuring the loss function reduction. Based on the above approximation, we provide the convergence analysis for Federated Dropout. Specifically, it is shown that a larger dropout rate of each device leads to a slower convergence rate. This provides a theoretical foundation for reducing the convergence latency by making a tradeoff between the per-round latency and the overall rounds till convergence. Moreover, a low-complexity algorithm is proposed to jointly optimize the dropout rate and the bandwidth allocation for minimizing the loss function in all rounds under a given per-round latency and limited network resources. Finally, numerical results are provided to verify the effectiveness of the proposed algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00379v1-abstract-full').style.display = 'none'; document.getElementById('2501.00379v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00064">arXiv:2501.00064</a> <span> [<a href="https://arxiv.org/pdf/2501.00064">pdf</a>, <a href="https://arxiv.org/format/2501.00064">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Lungmix: A Mixup-Based Strategy for Generalization in Respiratory Sound Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+S">Shijia Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weixiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+B">Baixu Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00064v1-abstract-short" style="display: inline;"> Respiratory sound classification plays a pivotal role in diagnosing respiratory diseases. While deep learning models have shown success with various respiratory sound datasets, our experiments indicate that models trained on one dataset often fail to generalize effectively to others, mainly due to data collection and annotation \emph{inconsistencies}. To address this limitation, we introduce \emph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00064v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00064v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00064v1-abstract-full" style="display: none;"> Respiratory sound classification plays a pivotal role in diagnosing respiratory diseases. While deep learning models have shown success with various respiratory sound datasets, our experiments indicate that models trained on one dataset often fail to generalize effectively to others, mainly due to data collection and annotation \emph{inconsistencies}. To address this limitation, we introduce \emph{Lungmix}, a novel data augmentation technique inspired by Mixup. Lungmix generates augmented data by blending waveforms using loudness and random masks while interpolating labels based on their semantic meaning, helping the model learn more generalized representations. Comprehensive evaluations across three datasets, namely ICBHI, SPR, and HF, demonstrate that Lungmix significantly enhances model generalization to unseen data. In particular, Lungmix boosts the 4-class classification score by up to 3.55\%, achieving performance comparable to models trained directly on the target dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00064v1-abstract-full').style.display = 'none'; document.getElementById('2501.00064v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4pages, 3 figures, conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15244">arXiv:2412.15244</a> <span> [<a href="https://arxiv.org/pdf/2412.15244">pdf</a>, <a href="https://arxiv.org/format/2412.15244">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MPPO: Multi Pair-wise Preference Optimization for LLMs with Arbitrary Negative Samples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuo Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fangzhi Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahui Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+L">Lulu Wen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Wei Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaowei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Junxiong Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kai Zhou</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15244v1-abstract-short" style="display: inline;"> Aligning Large Language Models (LLMs) with human feedback is crucial for their development. Existing preference optimization methods such as DPO and KTO, while improved based on Reinforcement Learning from Human Feedback (RLHF), are inherently derived from PPO, requiring a reference model that adds GPU memory resources and relies heavily on abundant preference data. Meanwhile, current preference o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15244v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15244v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15244v1-abstract-full" style="display: none;"> Aligning Large Language Models (LLMs) with human feedback is crucial for their development. Existing preference optimization methods such as DPO and KTO, while improved based on Reinforcement Learning from Human Feedback (RLHF), are inherently derived from PPO, requiring a reference model that adds GPU memory resources and relies heavily on abundant preference data. Meanwhile, current preference optimization research mainly targets single-question scenarios with two replies, neglecting optimization with multiple replies, which leads to a waste of data in the application. This study introduces the MPPO algorithm, which leverages the average likelihood of model responses to fit the reward function and maximizes the utilization of preference data. Through a comparison of Point-wise, Pair-wise, and List-wise implementations, we found that the Pair-wise approach achieves the best performance, significantly enhancing the quality of model responses. Experimental results demonstrate MPPO's outstanding performance across various benchmarks. On MT-Bench, MPPO outperforms DPO, ORPO, and SimPO. Notably, on Arena-Hard, MPPO surpasses DPO and ORPO by substantial margins. These achievements underscore the remarkable advantages of MPPO in preference optimization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15244v1-abstract-full').style.display = 'none'; document.getElementById('2412.15244v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by COLING2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14171">arXiv:2412.14171</a> <span> [<a href="https://arxiv.org/pdf/2412.14171">pdf</a>, <a href="https://arxiv.org/format/2412.14171">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Thinking in Space: How Multimodal Large Language Models See, Remember, and Recall Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jihan Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shusheng Yang</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A+W">Anjali W. Gupta</a>, <a href="/search/cs?searchtype=author&query=Han%2C+R">Rilyn Han</a>, <a href="/search/cs?searchtype=author&query=Fei-Fei%2C+L">Li Fei-Fei</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14171v1-abstract-short" style="display: inline;"> Humans possess the visual-spatial intelligence to remember spaces from sequential visual observations. However, can Multimodal Large Language Models (MLLMs) trained on million-scale video datasets also ``think in space'' from videos? We present a novel video-based visual-spatial intelligence benchmark (VSI-Bench) of over 5,000 question-answer pairs, and find that MLLMs exhibit competitive - though… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14171v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14171v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14171v1-abstract-full" style="display: none;"> Humans possess the visual-spatial intelligence to remember spaces from sequential visual observations. However, can Multimodal Large Language Models (MLLMs) trained on million-scale video datasets also ``think in space'' from videos? We present a novel video-based visual-spatial intelligence benchmark (VSI-Bench) of over 5,000 question-answer pairs, and find that MLLMs exhibit competitive - though subhuman - visual-spatial intelligence. We probe models to express how they think in space both linguistically and visually and find that while spatial reasoning capabilities remain the primary bottleneck for MLLMs to reach higher benchmark performance, local world models and spatial awareness do emerge within these models. Notably, prevailing linguistic reasoning techniques (e.g., chain-of-thought, self-consistency, tree-of-thoughts) fail to improve performance, whereas explicitly generating cognitive maps during question-answering enhances MLLMs' spatial distance ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14171v1-abstract-full').style.display = 'none'; document.getElementById('2412.14171v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://vision-x-nyu.github.io/thinking-in-space.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14164">arXiv:2412.14164</a> <span> [<a href="https://arxiv.org/pdf/2412.14164">pdf</a>, <a href="https://arxiv.org/format/2412.14164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MetaMorph: Multimodal Understanding and Generation via Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+S">Shengbang Tong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+D">David Fan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiachen Zhu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yunyang Xiong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinlei Chen</a>, <a href="/search/cs?searchtype=author&query=Sinha%2C+K">Koustuv Sinha</a>, <a href="/search/cs?searchtype=author&query=Rabbat%2C+M">Michael Rabbat</a>, <a href="/search/cs?searchtype=author&query=LeCun%2C+Y">Yann LeCun</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14164v1-abstract-short" style="display: inline;"> In this work, we propose Visual-Predictive Instruction Tuning (VPiT) - a simple and effective extension to visual instruction tuning that enables a pretrained LLM to quickly morph into an unified autoregressive model capable of generating both text and visual tokens. VPiT teaches an LLM to predict discrete text tokens and continuous visual tokens from any input sequence of image and text data cura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14164v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14164v1-abstract-full" style="display: none;"> In this work, we propose Visual-Predictive Instruction Tuning (VPiT) - a simple and effective extension to visual instruction tuning that enables a pretrained LLM to quickly morph into an unified autoregressive model capable of generating both text and visual tokens. VPiT teaches an LLM to predict discrete text tokens and continuous visual tokens from any input sequence of image and text data curated in an instruction-following format. Our empirical investigation reveals several intriguing properties of VPiT: (1) visual generation ability emerges as a natural byproduct of improved visual understanding, and can be unlocked efficiently with a small amount of generation data; (2) while we find understanding and generation to be mutually beneficial, understanding data contributes to both capabilities more effectively than generation data. Building upon these findings, we train our MetaMorph model and achieve competitive performance on both visual understanding and generation. In visual generation, MetaMorph can leverage the world knowledge and reasoning abilities gained from LLM pretraining, and overcome common failure modes exhibited by other generation models. Our results suggest that LLMs may have strong "prior" vision capabilities that can be efficiently adapted to both visual understanding and generation with a relatively simple instruction tuning process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14164v1-abstract-full').style.display = 'none'; document.getElementById('2412.14164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at tsb0601.github.io/metamorph</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13552">arXiv:2412.13552</a> <span> [<a href="https://arxiv.org/pdf/2412.13552">pdf</a>, <a href="https://arxiv.org/format/2412.13552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DragScene: Interactive 3D Scene Editing with Single-view Drag Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenghao Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenzhe Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunpeng Bai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13552v1-abstract-short" style="display: inline;"> 3D editing has shown remarkable capability in editing scenes based on various instructions. However, existing methods struggle with achieving intuitive, localized editing, such as selectively making flowers blossom. Drag-style editing has shown exceptional capability to edit images with direct manipulation instead of ambiguous text commands. Nevertheless, extending drag-based editing to 3D scenes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13552v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13552v1-abstract-full" style="display: none;"> 3D editing has shown remarkable capability in editing scenes based on various instructions. However, existing methods struggle with achieving intuitive, localized editing, such as selectively making flowers blossom. Drag-style editing has shown exceptional capability to edit images with direct manipulation instead of ambiguous text commands. Nevertheless, extending drag-based editing to 3D scenes presents substantial challenges due to multi-view inconsistency. To this end, we introduce DragScene, a framework that integrates drag-style editing with diverse 3D representations. First, latent optimization is performed on a reference view to generate 2D edits based on user instructions. Subsequently, coarse 3D clues are reconstructed from the reference view using a point-based representation to capture the geometric details of the edits. The latent representation of the edited view is then mapped to these 3D clues, guiding the latent optimization of other views. This process ensures that edits are propagated seamlessly across multiple views, maintaining multi-view consistency. Finally, the target 3D scene is reconstructed from the edited multi-view images. Extensive experiments demonstrate that DragScene facilitates precise and flexible drag-style editing of 3D scenes, supporting broad applicability across diverse 3D representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13552v1-abstract-full').style.display = 'none'; document.getElementById('2412.13552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10153">arXiv:2412.10153</a> <span> [<a href="https://arxiv.org/pdf/2412.10153">pdf</a>, <a href="https://arxiv.org/format/2412.10153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> EVOS: Efficient Implicit Neural Training via EVOlutionary Selector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weixiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Chengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Siyi Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+S">Shijia Ge</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingzi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10153v1-abstract-short" style="display: inline;"> We propose EVOlutionary Selector (EVOS), an efficient training paradigm for accelerating Implicit Neural Representation (INR). Unlike conventional INR training that feeds all samples through the neural network in each iteration, our approach restricts training to strategically selected points, reducing computational overhead by eliminating redundant forward passes. Specifically, we treat each samp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10153v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10153v1-abstract-full" style="display: none;"> We propose EVOlutionary Selector (EVOS), an efficient training paradigm for accelerating Implicit Neural Representation (INR). Unlike conventional INR training that feeds all samples through the neural network in each iteration, our approach restricts training to strategically selected points, reducing computational overhead by eliminating redundant forward passes. Specifically, we treat each sample as an individual in an evolutionary process, where only those fittest ones survive and merit inclusion in training, adaptively evolving with the neural network dynamics. While this is conceptually similar to Evolutionary Algorithms, their distinct objectives (selection for acceleration vs. iterative solution optimization) require a fundamental redefinition of evolutionary mechanisms for our context. In response, we design sparse fitness evaluation, frequency-guided crossover, and augmented unbiased mutation to comprise EVOS. These components respectively guide sample selection with reduced computational cost, enhance performance through frequency-domain balance, and mitigate selection bias from cached evaluation. Extensive experiments demonstrate that our method achieves approximately 48%-66% reduction in training time while ensuring superior convergence without additional cost, establishing state-of-the-art acceleration among recent sampling-based strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10153v1-abstract-full').style.display = 'none'; document.getElementById('2412.10153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09213">arXiv:2412.09213</a> <span> [<a href="https://arxiv.org/pdf/2412.09213">pdf</a>, <a href="https://arxiv.org/format/2412.09213">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Implicit Neural Representations via Symmetric Power Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weixiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Chengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+S">Shijia Ge</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingzi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09213v1-abstract-short" style="display: inline;"> We propose symmetric power transformation to enhance the capacity of Implicit Neural Representation~(INR) from the perspective of data transformation. Unlike prior work utilizing random permutation or index rearrangement, our method features a reversible operation that does not require additional storage consumption. Specifically, we first investigate the characteristics of data that can benefit t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09213v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09213v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09213v1-abstract-full" style="display: none;"> We propose symmetric power transformation to enhance the capacity of Implicit Neural Representation~(INR) from the perspective of data transformation. Unlike prior work utilizing random permutation or index rearrangement, our method features a reversible operation that does not require additional storage consumption. Specifically, we first investigate the characteristics of data that can benefit the training of INR, proposing the Range-Defined Symmetric Hypothesis, which posits that specific range and symmetry can improve the expressive ability of INR. Based on this hypothesis, we propose a nonlinear symmetric power transformation to achieve both range-defined and symmetric properties simultaneously. We use the power coefficient to redistribute data to approximate symmetry within the target range. To improve the robustness of the transformation, we further design deviation-aware calibration and adaptive soft boundary to address issues of extreme deviation boosting and continuity breaking. Extensive experiments are conducted to verify the performance of the proposed method, demonstrating that our transformation can reliably improve INR compared with other data transformations. We also conduct 1D audio, 2D image and 3D video fitting tasks to demonstrate the effectiveness and applicability of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09213v1-abstract-full').style.display = 'none'; document.getElementById('2412.09213v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07778">arXiv:2412.07778</a> <span> [<a href="https://arxiv.org/pdf/2412.07778">pdf</a>, <a href="https://arxiv.org/format/2412.07778">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MIN: Multi-channel Interaction Network for Drug-Target Interaction with Protein Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuqi Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shufang Xie</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hongda Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+T">Tao Qin</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+T">Tianjun Ke</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07778v1-abstract-short" style="display: inline;"> Traditional drug discovery processes are both time-consuming and require extensive professional expertise. With the accumulation of drug-target interaction (DTI) data from experimental studies, leveraging modern machine-learning techniques to discern patterns between drugs and target proteins has become increasingly feasible. In this paper, we introduce the Multi-channel Interaction Network (MIN),… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07778v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07778v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07778v1-abstract-full" style="display: none;"> Traditional drug discovery processes are both time-consuming and require extensive professional expertise. With the accumulation of drug-target interaction (DTI) data from experimental studies, leveraging modern machine-learning techniques to discern patterns between drugs and target proteins has become increasingly feasible. In this paper, we introduce the Multi-channel Interaction Network (MIN), a novel framework designed to predict DTIs through two primary components: a representation learning module and a multi-channel interaction module. The representation learning module features a C-Score Predictor-assisted screening mechanism, which selects critical residues to enhance prediction accuracy and reduce noise. The multi-channel interaction module incorporates a structure-agnostic channel, a structure-aware channel, and an extended-mixture channel, facilitating the identification of interaction patterns at various levels for optimal complementarity. Additionally, contrastive learning is utilized to harmonize the representations of diverse data types. Our experimental evaluations on public datasets demonstrate that MIN surpasses other strong DTI prediction methods. Furthermore, the case study reveals a high overlap between the residues selected by the C-Score Predictor and those in actual binding pockets, underscoring MIN's explainability capability. These findings affirm that MIN is not only a potent tool for DTI prediction but also offers fresh insights into the prediction of protein binding sites. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07778v1-abstract-full').style.display = 'none'; document.getElementById('2412.07778v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07618">arXiv:2412.07618</a> <span> [<a href="https://arxiv.org/pdf/2412.07618">pdf</a>, <a href="https://arxiv.org/format/2412.07618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Adapting to Non-Stationary Environments: Multi-Armed Bandit Enhanced Retrieval-Augmented Generation on Knowledge Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiaqiang Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sihong Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07618v2-abstract-short" style="display: inline;"> Despite the superior performance of Large language models on many NLP tasks, they still face significant limitations in memorizing extensive world knowledge. Recent studies have demonstrated that leveraging the Retrieval-Augmented Generation (RAG) framework, combined with Knowledge Graphs that encapsulate extensive factual data in a structured format, robustly enhances the reasoning capabilities o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07618v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07618v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07618v2-abstract-full" style="display: none;"> Despite the superior performance of Large language models on many NLP tasks, they still face significant limitations in memorizing extensive world knowledge. Recent studies have demonstrated that leveraging the Retrieval-Augmented Generation (RAG) framework, combined with Knowledge Graphs that encapsulate extensive factual data in a structured format, robustly enhances the reasoning capabilities of LLMs. However, deploying such systems in real-world scenarios presents challenges: the continuous evolution of non-stationary environments may lead to performance degradation and user satisfaction requires a careful balance of performance and responsiveness. To address these challenges, we introduce a Multi-objective Multi-Armed Bandit enhanced RAG framework, supported by multiple retrieval methods with diverse capabilities under rich and evolving retrieval contexts in practice. Within this framework, each retrieval method is treated as a distinct ``arm''. The system utilizes real-time user feedback to adapt to dynamic environments, by selecting the appropriate retrieval method based on input queries and the historical multi-objective performance of each arm. Extensive experiments conducted on two benchmark KGQA datasets demonstrate that our method significantly outperforms baseline methods in non-stationary settings while achieving state-of-the-art performance in stationary environments. Code and data are available at https://github.com/FUTUREEEEEE/Dynamic-RAG.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07618v2-abstract-full').style.display = 'none'; document.getElementById('2412.07618v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05808">arXiv:2412.05808</a> <span> [<a href="https://arxiv.org/pdf/2412.05808">pdf</a>, <a href="https://arxiv.org/format/2412.05808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> SizeGS: Size-aware Compression of 3D Gaussians with Hierarchical Mixed Precision Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiahang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weixiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+S">Shijia Ge</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Sicheng Pan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunpeng Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05808v1-abstract-short" style="display: inline;"> Effective compression technology is crucial for 3DGS to adapt to varying storage and transmission conditions. However, existing methods fail to address size constraints while maintaining optimal quality. In this paper, we introduce SizeGS, a framework that compresses 3DGS within a specified size budget while optimizing visual quality. We start with a size estimator to establish a clear relationshi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05808v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05808v1-abstract-full" style="display: none;"> Effective compression technology is crucial for 3DGS to adapt to varying storage and transmission conditions. However, existing methods fail to address size constraints while maintaining optimal quality. In this paper, we introduce SizeGS, a framework that compresses 3DGS within a specified size budget while optimizing visual quality. We start with a size estimator to establish a clear relationship between file size and hyperparameters. Leveraging this estimator, we incorporate mixed precision quantization (MPQ) into 3DGS attributes, structuring MPQ in two hierarchical level -- inter-attribute and intra-attribute -- to optimize visual quality under the size constraint. At the inter-attribute level, we assign bit-widths to each attribute channel by formulating the combinatorial optimization as a 0-1 integer linear program, which can be efficiently solved. At the intra-attribute level, we divide each attribute channel into blocks of vectors, quantizing each vector based on the optimal bit-width derived at the inter-attribute level. Dynamic programming determines block lengths. Using the size estimator and MPQ, we develop a calibrated algorithm to identify optimal hyperparameters in just 10 minutes, achieving a 1.69$\times$ efficiency increase with quality comparable to state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05808v1-abstract-full').style.display = 'none'; document.getElementById('2412.05808v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Automatically compressing 3DGS into the desired file size while maximizing the visual quality</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05695">arXiv:2412.05695</a> <span> [<a href="https://arxiv.org/pdf/2412.05695">pdf</a>, <a href="https://arxiv.org/format/2412.05695">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> WATER-GS: Toward Copyright Protection for 3D Gaussian Splatting via Universal Watermarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yuqi Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05695v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has emerged as a pivotal technique for 3D scene representation, providing rapid rendering speeds and high fidelity. As 3DGS gains prominence, safeguarding its intellectual property becomes increasingly crucial since 3DGS could be used to imitate unauthorized scene creations and raise copyright issues. Existing watermarking methods for implicit NeRFs cannot be directly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05695v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05695v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05695v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has emerged as a pivotal technique for 3D scene representation, providing rapid rendering speeds and high fidelity. As 3DGS gains prominence, safeguarding its intellectual property becomes increasingly crucial since 3DGS could be used to imitate unauthorized scene creations and raise copyright issues. Existing watermarking methods for implicit NeRFs cannot be directly applied to 3DGS due to its explicit representation and real-time rendering process, leaving watermarking for 3DGS largely unexplored. In response, we propose WATER-GS, a novel method designed to protect 3DGS copyrights through a universal watermarking strategy. First, we introduce a pre-trained watermark decoder, treating raw 3DGS generative modules as potential watermark encoders to ensure imperceptibility. Additionally, we implement novel 3D distortion layers to enhance the robustness of the embedded watermark against common real-world distortions of point cloud data. Comprehensive experiments and ablation studies demonstrate that WATER-GS effectively embeds imperceptible and robust watermarks into 3DGS without compromising rendering efficiency and quality. Our experiments indicate that the 3D distortion layers can yield up to a 20% improvement in accuracy rate. Notably, our method is adaptable to different 3DGS variants, including 3DGS compression frameworks and 2D Gaussian splatting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05695v1-abstract-full').style.display = 'none'; document.getElementById('2412.05695v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04908">arXiv:2412.04908</a> <span> [<a href="https://arxiv.org/pdf/2412.04908">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> PERCY: A Multimodal Dataset and Conversational System for Personalized and Emotionally Aware Human-Robot Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Althubyani%2C+M">Mohammed Althubyani</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zhijin Meng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengyuan Xie</a>, <a href="/search/cs?searchtype=author&query=Seung%2C+C">Cha Seung</a>, <a href="/search/cs?searchtype=author&query=Razzak%2C+I">Imran Razzak</a>, <a href="/search/cs?searchtype=author&query=Sandoval%2C+E+B">Eduardo Benitez Sandoval</a>, <a href="/search/cs?searchtype=author&query=Kocaballi%2C+B">Baki Kocaballi</a>, <a href="/search/cs?searchtype=author&query=Bamdad%2C+M">Mahdi Bamdad</a>, <a href="/search/cs?searchtype=author&query=Naranjo%2C+F+C">Francisco Cruz Naranjo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04908v1-abstract-short" style="display: inline;"> The integration of conversational agents into our daily lives has become increasingly common, yet many of these agents cannot engage in deep interactions with humans. Despite this, there is a noticeable shortage of datasets that capture multimodal information from human-robot interaction dialogues. To address this gap, we have developed a Personal Emotional Robotic Conversational sYstem (PERCY) an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04908v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04908v1-abstract-full" style="display: none;"> The integration of conversational agents into our daily lives has become increasingly common, yet many of these agents cannot engage in deep interactions with humans. Despite this, there is a noticeable shortage of datasets that capture multimodal information from human-robot interaction dialogues. To address this gap, we have developed a Personal Emotional Robotic Conversational sYstem (PERCY) and recorded a novel multimodal dataset that encompasses rich embodied interaction data. The process involved asking participants to complete a questionnaire and gathering their profiles on ten topics, such as hobbies and favourite music. Subsequently, we initiated conversations between the robot and the participants, leveraging GPT-4 to generate contextually appropriate responses based on the participant's profile and emotional state, as determined by facial expression recognition and sentiment analysis. Automatic and user evaluations were conducted to assess the overall quality of the collected data. The results of both evaluations indicated a high level of naturalness, engagement, fluency, consistency, and relevance in the conversation, as well as the robot's ability to provide empathetic responses. It is worth noting that the dataset is derived from genuine interactions with the robot, involving participants who provided personal information and conveyed actual emotions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04908v1-abstract-full').style.display = 'none'; document.getElementById('2412.04908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 Figures, Rejected from International Conference of Human Robot Interaction 2025, Melbourne, Australia</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> K.4.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01572">arXiv:2412.01572</a> <span> [<a href="https://arxiv.org/pdf/2412.01572">pdf</a>, <a href="https://arxiv.org/format/2412.01572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MBA-RAG: a Bandit Approach for Adaptive Retrieval-Augmented Generation through Question Complexity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiaqiang Tang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qiang Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sihong Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01572v4-abstract-short" style="display: inline;"> Retrieval Augmented Generation (RAG) has proven to be highly effective in boosting the generative performance of language model in knowledge-intensive tasks. However, existing RAG framework either indiscriminately perform retrieval or rely on rigid single-class classifiers to select retrieval methods, leading to inefficiencies and suboptimal performance across queries of varying complexity. To add… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01572v4-abstract-full').style.display = 'inline'; document.getElementById('2412.01572v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01572v4-abstract-full" style="display: none;"> Retrieval Augmented Generation (RAG) has proven to be highly effective in boosting the generative performance of language model in knowledge-intensive tasks. However, existing RAG framework either indiscriminately perform retrieval or rely on rigid single-class classifiers to select retrieval methods, leading to inefficiencies and suboptimal performance across queries of varying complexity. To address these challenges, we propose a reinforcement learning-based framework that dynamically selects the most suitable retrieval strategy based on query complexity. % our solution Our approach leverages a multi-armed bandit algorithm, which treats each retrieval method as a distinct ``arm'' and adapts the selection process by balancing exploration and exploitation. Additionally, we introduce a dynamic reward function that balances accuracy and efficiency, penalizing methods that require more retrieval steps, even if they lead to a correct result. Our method achieves new state of the art results on multiple single-hop and multi-hop datasets while reducing retrieval costs. Our code are available at https://github.com/FUTUREEEEEE/MBA . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01572v4-abstract-full').style.display = 'none'; document.getElementById('2412.01572v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01207">arXiv:2412.01207</a> <span> [<a href="https://arxiv.org/pdf/2412.01207">pdf</a>, <a href="https://arxiv.org/format/2412.01207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Siamese Machine Unlearning with Knowledge Vaporization and Concentration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Songjie Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hengtao He</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Shenghui Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Letaief%2C+K+B">Khaled B. Letaief</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01207v1-abstract-short" style="display: inline;"> In response to the practical demands of the ``right to be forgotten" and the removal of undesired data, machine unlearning emerges as an essential technique to remove the learned knowledge of a fraction of data points from trained models. However, existing methods suffer from limitations such as insufficient methodological support, high computational complexity, and significant memory demands. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01207v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01207v1-abstract-full" style="display: none;"> In response to the practical demands of the ``right to be forgotten" and the removal of undesired data, machine unlearning emerges as an essential technique to remove the learned knowledge of a fraction of data points from trained models. However, existing methods suffer from limitations such as insufficient methodological support, high computational complexity, and significant memory demands. In this work, we propose the concepts of knowledge vaporization and concentration to selectively erase learned knowledge from specific data points while maintaining representations for the remaining data. Utilizing the Siamese networks, we exemplify the proposed concepts and develop an efficient method for machine unlearning. Our proposed Siamese unlearning method does not require additional memory overhead and full access to the remaining dataset. Extensive experiments conducted across multiple unlearning scenarios showcase the superiority of Siamese unlearning over baseline methods, illustrating its ability to effectively remove knowledge from forgetting data, enhance model utility on remaining data, and reduce susceptibility to membership inference attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01207v1-abstract-full').style.display = 'none'; document.getElementById('2412.01207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00862">arXiv:2412.00862</a> <span> [<a href="https://arxiv.org/pdf/2412.00862">pdf</a>, <a href="https://arxiv.org/format/2412.00862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Toward Real-Time Edge AI: Model-Agnostic Task-Oriented Communication with Visual Feature Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Songjie Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hengtao He</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Shenghui Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Letaief%2C+K+B">Khaled B. Letaief</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00862v1-abstract-short" style="display: inline;"> Task-oriented communication presents a promising approach to improve the communication efficiency of edge inference systems by optimizing learning-based modules to extract and transmit relevant task information. However, real-time applications face practical challenges, such as incomplete coverage and potential malfunctions of edge servers. This situation necessitates cross-model communication bet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00862v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00862v1-abstract-full" style="display: none;"> Task-oriented communication presents a promising approach to improve the communication efficiency of edge inference systems by optimizing learning-based modules to extract and transmit relevant task information. However, real-time applications face practical challenges, such as incomplete coverage and potential malfunctions of edge servers. This situation necessitates cross-model communication between different inference systems, enabling edge devices from one service provider to collaborate effectively with edge servers from another. Independent optimization of diverse edge systems often leads to incoherent feature spaces, which hinders the cross-model inference for existing task-oriented communication. To facilitate and achieve effective cross-model task-oriented communication, this study introduces a novel framework that utilizes shared anchor data across diverse systems. This approach addresses the challenge of feature alignment in both server-based and on-device scenarios. In particular, by leveraging the linear invariance of visual features, we propose efficient server-based feature alignment techniques to estimate linear transformations using encoded anchor data features. For on-device alignment, we exploit the angle-preserving nature of visual features and propose to encode relative representations with anchor data to streamline cross-model communication without additional alignment procedures during the inference. The experimental results on computer vision benchmarks demonstrate the superior performance of the proposed feature alignment approaches in cross-model task-oriented communications. The runtime and computation overhead analysis further confirm the effectiveness of the proposed feature alignment approaches in real-time applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00862v1-abstract-full').style.display = 'none'; document.getElementById('2412.00862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13902">arXiv:2411.13902</a> <span> [<a href="https://arxiv.org/pdf/2411.13902">pdf</a>, <a href="https://arxiv.org/format/2411.13902">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PIORS: Personalized Intelligent Outpatient Reception based on Large Language Model with Multi-Agents Medical Scenario Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+Z">Zhijie Bao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingyun Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Ying Guo</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhengqiang Ye</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jun Shen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shirong Xie</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jiajie Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13902v1-abstract-short" style="display: inline;"> In China, receptionist nurses face overwhelming workloads in outpatient settings, limiting their time and attention for each patient and ultimately reducing service quality. In this paper, we present the Personalized Intelligent Outpatient Reception System (PIORS). This system integrates an LLM-based reception nurse and a collaboration between LLM and hospital information system (HIS) into real ou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13902v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13902v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13902v1-abstract-full" style="display: none;"> In China, receptionist nurses face overwhelming workloads in outpatient settings, limiting their time and attention for each patient and ultimately reducing service quality. In this paper, we present the Personalized Intelligent Outpatient Reception System (PIORS). This system integrates an LLM-based reception nurse and a collaboration between LLM and hospital information system (HIS) into real outpatient reception setting, aiming to deliver personalized, high-quality, and efficient reception services. Additionally, to enhance the performance of LLMs in real-world healthcare scenarios, we propose a medical conversational data generation framework named Service Flow aware Medical Scenario Simulation (SFMSS), aiming to adapt the LLM to the real-world environments and PIORS settings. We evaluate the effectiveness of PIORS and SFMSS through automatic and human assessments involving 15 users and 15 clinical experts. The results demonstrate that PIORS-Nurse outperforms all baselines, including the current state-of-the-art model GPT-4o, and aligns with human preferences and clinical needs. Further details and demo can be found at https://github.com/FudanDISC/PIORS <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13902v1-abstract-full').style.display = 'none'; document.getElementById('2411.13902v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07150">arXiv:2411.07150</a> <span> [<a href="https://arxiv.org/pdf/2411.07150">pdf</a>, <a href="https://arxiv.org/format/2411.07150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Variational Graph Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shifeng Xie</a>, <a href="/search/cs?searchtype=author&query=Giraldo%2C+J+H">Jhony H. Giraldo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07150v1-abstract-short" style="display: inline;"> Graph representation learning (GRL) is a fundamental task in machine learning, aiming to encode high-dimensional graph-structured data into low-dimensional vectors. Self-supervised learning (SSL) methods are widely used in GRL because they can avoid expensive human annotation. In this work, we propose a novel Subgraph Gaussian Embedding Contrast (SGEC) method. Our approach introduces a subgraph Ga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07150v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07150v1-abstract-full" style="display: none;"> Graph representation learning (GRL) is a fundamental task in machine learning, aiming to encode high-dimensional graph-structured data into low-dimensional vectors. Self-supervised learning (SSL) methods are widely used in GRL because they can avoid expensive human annotation. In this work, we propose a novel Subgraph Gaussian Embedding Contrast (SGEC) method. Our approach introduces a subgraph Gaussian embedding module, which adaptively maps subgraphs to a structured Gaussian space, ensuring the preservation of graph characteristics while controlling the distribution of generated subgraphs. We employ optimal transport distances, including Wasserstein and Gromov-Wasserstein distances, to effectively measure the similarity between subgraphs, enhancing the robustness of the contrastive learning process. Extensive experiments across multiple benchmarks demonstrate that SGEC outperforms or presents competitive performance against state-of-the-art approaches. Our findings provide insights into the design of SSL methods for GRL, emphasizing the importance of the distribution of the generated contrastive pairs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07150v1-abstract-full').style.display = 'none'; document.getElementById('2411.07150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24022">arXiv:2410.24022</a> <span> [<a href="https://arxiv.org/pdf/2410.24022">pdf</a>, <a href="https://arxiv.org/format/2410.24022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SFM-Protein: Integrative Co-evolutionary Pre-training for Advanced Protein Sequence Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peiran Jin</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yaosen Min</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shufang Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lijun Wu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+T">Tao Qin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaozhuan Liang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+K">Kaiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuliang Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tie-Yan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24022v1-abstract-short" style="display: inline;"> Proteins, essential to biological systems, perform functions intricately linked to their three-dimensional structures. Understanding the relationship between protein structures and their amino acid sequences remains a core challenge in protein modeling. While traditional protein foundation models benefit from pre-training on vast unlabeled datasets, they often struggle to capture critical co-evolu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24022v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24022v1-abstract-full" style="display: none;"> Proteins, essential to biological systems, perform functions intricately linked to their three-dimensional structures. Understanding the relationship between protein structures and their amino acid sequences remains a core challenge in protein modeling. While traditional protein foundation models benefit from pre-training on vast unlabeled datasets, they often struggle to capture critical co-evolutionary information, which evolutionary-based methods excel at. In this study, we introduce a novel pre-training strategy for protein foundation models that emphasizes the interactions among amino acid residues to enhance the extraction of both short-range and long-range co-evolutionary features from sequence data. Trained on a large-scale protein sequence dataset, our model demonstrates superior generalization ability, outperforming established baselines of similar size, including the ESM model, across diverse downstream tasks. Experimental results confirm the model's effectiveness in integrating co-evolutionary information, marking a significant step forward in protein sequence-based modeling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24022v1-abstract-full').style.display = 'none'; document.getElementById('2410.24022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22217">arXiv:2410.22217</a> <span> [<a href="https://arxiv.org/pdf/2410.22217">pdf</a>, <a href="https://arxiv.org/format/2410.22217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Unifying Understanding and Generation in the Era of Vision Foundation Models: A Survey from the Autoregression Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shenghao Xie</a>, <a href="/search/cs?searchtype=author&query=Zu%2C+W">Wenqiang Zu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+D">Duo Su</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilong Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+R">Ruohua Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lei Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22217v2-abstract-short" style="display: inline;"> Autoregression in large language models (LLMs) has shown impressive scalability by unifying all language tasks into the next token prediction paradigm. Recently, there is a growing interest in extending this success to vision foundation models. In this survey, we review the recent advances and discuss future directions for autoregressive vision foundation models. First, we present the trend for ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22217v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22217v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22217v2-abstract-full" style="display: none;"> Autoregression in large language models (LLMs) has shown impressive scalability by unifying all language tasks into the next token prediction paradigm. Recently, there is a growing interest in extending this success to vision foundation models. In this survey, we review the recent advances and discuss future directions for autoregressive vision foundation models. First, we present the trend for next generation of vision foundation models, i.e., unifying both understanding and generation in vision tasks. We then analyze the limitations of existing vision foundation models, and present a formal definition of autoregression with its advantages. Later, we categorize autoregressive vision foundation models from their vision tokenizers and autoregression backbones. Finally, we discuss several promising research challenges and directions. To the best of our knowledge, this is the first survey to comprehensively summarize autoregressive vision foundation models under the trend of unifying understanding and generation. A collection of related resources is available at https://github.com/EmmaSRH/ARVFM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22217v2-abstract-full').style.display = 'none'; document.getElementById('2410.22217v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 1 table, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18810">arXiv:2410.18810</a> <span> [<a href="https://arxiv.org/pdf/2410.18810">pdf</a>, <a href="https://arxiv.org/format/2410.18810">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> TangibleChannel: An Innovative Data Physicalization System for Visual Channel Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Siqi Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lingyun Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18810v1-abstract-short" style="display: inline;"> In this paper, we provide an overview of our attempts to harness data physicalizations as pedagogical tools for enhancing the understanding of visual channels. We first elaborate the research goals that we have crafted for the physicalization prototype, shedding light on the key principles that guided our design choices. Then we detail the materials and datasets we employed for nine channels on ou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18810v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18810v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18810v1-abstract-full" style="display: none;"> In this paper, we provide an overview of our attempts to harness data physicalizations as pedagogical tools for enhancing the understanding of visual channels. We first elaborate the research goals that we have crafted for the physicalization prototype, shedding light on the key principles that guided our design choices. Then we detail the materials and datasets we employed for nine channels on our physicalization prototype. A preliminary pilot study is followed to validate its effectiveness. In the end, we present our upcoming research initiatives, including a comparative study for assessing the usability of the physicalization system. In general, the main purpose of our work is to stimulate a wider engagement among visualization educators and researchers, encouraging them to delve into the potentialities of data physicalization as an innovative addition to contemporary teaching methodologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18810v1-abstract-full').style.display = 'none'; document.getElementById('2410.18810v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 pages, 1 figure, IEEE VIS 2023 Poster</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17251">arXiv:2410.17251</a> <span> [<a href="https://arxiv.org/pdf/2410.17251">pdf</a>, <a href="https://arxiv.org/format/2410.17251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Altogether: Image Captioning via Re-aligning Alt-text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hu Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Po-Yao Huang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X+E">Xiaoqing Ellen Tan</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/cs?searchtype=author&query=Jou%2C+C">Christine Jou</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+G">Gargi Ghosh</a>, <a href="/search/cs?searchtype=author&query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Feichtenhofer%2C+C">Christoph Feichtenhofer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17251v3-abstract-short" style="display: inline;"> This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings. First, they caption images from scratch, ignoring existing alt-text metadata, and second, lack transparency if the captioners' training data (e.g. GPT) is unknown. In this paper, we study a principled approach Altogether based on the key idea to edit and re-align… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17251v3-abstract-full').style.display = 'inline'; document.getElementById('2410.17251v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17251v3-abstract-full" style="display: none;"> This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings. First, they caption images from scratch, ignoring existing alt-text metadata, and second, lack transparency if the captioners' training data (e.g. GPT) is unknown. In this paper, we study a principled approach Altogether based on the key idea to edit and re-align existing alt-texts associated with the images. To generate training data, we perform human annotation where annotators start with the existing alt-text and re-align it to the image content in multiple rounds, consequently constructing captions with rich visual concepts. This differs from prior work that carries out human annotation as a one-time description task solely based on images and annotator knowledge. We train a captioner on this data that generalizes the process of re-aligning alt-texts at scale. Our results show our Altogether approach leads to richer image captions that also improve text-to-image generation and zero-shot image classification tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17251v3-abstract-full').style.display = 'none'; document.getElementById('2410.17251v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by EMNLP 2024; Meta CLIP 1.2 Data Engine</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13486">arXiv:2410.13486</a> <span> [<a href="https://arxiv.org/pdf/2410.13486">pdf</a>, <a href="https://arxiv.org/format/2410.13486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SemSim: Revisiting Weak-to-Strong Consistency from a Semantic Similarity Perspective for Semi-supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shiao Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Z">Ziwei Niu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+S">Shuyi Ouyang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Lanfen Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13486v1-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) for medical image segmentation is a challenging yet highly practical task, which reduces reliance on large-scale labeled dataset by leveraging unlabeled samples. Among SSL techniques, the weak-to-strong consistency framework, popularized by FixMatch, has emerged as a state-of-the-art method in classification tasks. Notably, such a simple pipeline has also shown compe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13486v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13486v1-abstract-full" style="display: none;"> Semi-supervised learning (SSL) for medical image segmentation is a challenging yet highly practical task, which reduces reliance on large-scale labeled dataset by leveraging unlabeled samples. Among SSL techniques, the weak-to-strong consistency framework, popularized by FixMatch, has emerged as a state-of-the-art method in classification tasks. Notably, such a simple pipeline has also shown competitive performance in medical image segmentation. However, two key limitations still persist, impeding its efficient adaptation: (1) the neglect of contextual dependencies results in inconsistent predictions for similar semantic features, leading to incomplete object segmentation; (2) the lack of exploitation of semantic similarity between labeled and unlabeled data induces considerable class-distribution discrepancy. To address these limitations, we propose a novel semi-supervised framework based on FixMatch, named SemSim, powered by two appealing designs from semantic similarity perspective: (1) rectifying pixel-wise prediction by reasoning about the intra-image pair-wise affinity map, thus integrating contextual dependencies explicitly into the final prediction; (2) bridging labeled and unlabeled data via a feature querying mechanism for compact class representation learning, which fully considers cross-image anatomical similarities. As the reliable semantic similarity extraction depends on robust features, we further introduce an effective spatial-aware fusion module (SFM) to explore distinctive information from multiple scales. Extensive experiments show that SemSim yields consistent improvements over the state-of-the-art methods across three public segmentation benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13486v1-abstract-full').style.display = 'none'; document.getElementById('2410.13486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11341">arXiv:2410.11341</a> <span> [<a href="https://arxiv.org/pdf/2410.11341">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Using Zone Inflation and Volume Transfer to Design a Fabric-based Pneumatic Exosuit with both Efficiency and Wearability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chendong Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dapeng Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiachen Chen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yiming Dai</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Li Jiang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengquan Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11341v1-abstract-short" style="display: inline;"> Fabric-based pneumatic exosuits have a broad application prospect due to their good human-machine interaction performance, but their structural design paradigm has not yet been finalized and requires in-depth research. This paper proposes the concepts of zone inflation and volume transfer for the design of a fabric-based pneumatic exosuit with both efficiency and wearability. The meaning of zone i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11341v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11341v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11341v1-abstract-full" style="display: none;"> Fabric-based pneumatic exosuits have a broad application prospect due to their good human-machine interaction performance, but their structural design paradigm has not yet been finalized and requires in-depth research. This paper proposes the concepts of zone inflation and volume transfer for the design of a fabric-based pneumatic exosuit with both efficiency and wearability. The meaning of zone inflation is to divide the inflation area of pneumatic exosuit into inflation-deflation zone and inflation-holding zone which can reduce the consumption of compressed air and improve efficiency. Volume transfer, a strategic distribution method of inflatable regions inside the garment, can effectively enhance the wearability of the exosuit. Using inexpensive thermoplastic polyurethane film and clothing fabric, the exosuit is made by heat pressing and sewing. The exosuit has a response time of 0.5s, a stress area of 1500mm2, and a profile of only 32mm, which can be hidden inside common clothing. A mathematical model is developed to predict the output torque of the exosuit with an error of 3.6%. Mechanical experiments show that the exosuit outputs a torque of 9.1Nm at a pressure of 100kPa. Surface electromyography experiments show that the exosuit can provide users with a boost from sitting to standing, with an average reduction in electromyography signals of 14.95%. The exosuit designed using these methods synthesizes efficiency and wearability and is expected to be an ideal paradigm for fabric-based pneumatic exosuits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11341v1-abstract-full').style.display = 'none'; document.getElementById('2410.11341v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08198">arXiv:2410.08198</a> <span> [<a href="https://arxiv.org/pdf/2410.08198">pdf</a>, <a href="https://arxiv.org/format/2410.08198">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adam Exploits $\ell_\infty$-geometry of Loss Landscape via Coordinate-wise Adaptivity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuo Xie</a>, <a href="/search/cs?searchtype=author&query=Mohamadi%2C+M+A">Mohamad Amin Mohamadi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiyuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08198v1-abstract-short" style="display: inline;"> Adam outperforms SGD when training language models. Yet this advantage is not well-understood theoretically -- previous convergence analysis for Adam and SGD mainly focuses on the number of steps $T$ and is already minimax-optimal in non-convex cases, which are both $\widetilde{O}(T^{-1/4})$. In this work, we argue that the exploitation of nice $\ell_\infty$-geometry is the key advantage of Adam o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08198v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08198v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08198v1-abstract-full" style="display: none;"> Adam outperforms SGD when training language models. Yet this advantage is not well-understood theoretically -- previous convergence analysis for Adam and SGD mainly focuses on the number of steps $T$ and is already minimax-optimal in non-convex cases, which are both $\widetilde{O}(T^{-1/4})$. In this work, we argue that the exploitation of nice $\ell_\infty$-geometry is the key advantage of Adam over SGD. More specifically, we give a new convergence analysis for Adam under novel assumptions that loss is smooth under $\ell_\infty$-geometry rather than the more common $\ell_2$-geometry, which yields a much better empirical smoothness constant for GPT-2 and ResNet models. Our experiments confirm that Adam performs much worse when the favorable $\ell_\infty$-geometry is changed while SGD provably remains unaffected. We also extend the convergence analysis to blockwise Adam under novel blockwise smoothness assumptions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08198v1-abstract-full').style.display = 'none'; document.getElementById('2410.08198v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07523">arXiv:2410.07523</a> <span> [<a href="https://arxiv.org/pdf/2410.07523">pdf</a>, <a href="https://arxiv.org/format/2410.07523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DemoShapley: Valuation of Demonstrations for In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shan Xie</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+M">Man Luo</a>, <a href="/search/cs?searchtype=author&query=Stern%2C+C+D">Chadly Daniel Stern</a>, <a href="/search/cs?searchtype=author&query=Du%2C+M">Mengnan Du</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lu Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07523v1-abstract-short" style="display: inline;"> Large language models (LLMs) leveraging in-context learning (ICL) have set new benchmarks in few-shot learning across various tasks without needing task-specific fine-tuning. However, extensive research has demonstrated that the effectiveness of ICL is significantly influenced by the selection and ordering of demonstrations. Considering the critical role of demonstration selection in ICL, we intro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07523v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07523v1-abstract-full" style="display: none;"> Large language models (LLMs) leveraging in-context learning (ICL) have set new benchmarks in few-shot learning across various tasks without needing task-specific fine-tuning. However, extensive research has demonstrated that the effectiveness of ICL is significantly influenced by the selection and ordering of demonstrations. Considering the critical role of demonstration selection in ICL, we introduce DemoShapley which is inspired by the Data Shapley valuation theorem. This approach assesses the influence of individual demonstration instances, distinguishing between those that contribute positively and those that may hinder performance. Our findings reveal that DemoShapley not only enhances model performance in terms of accuracy and fairness but also generalizes queries from domains distinct from those of the in-context demonstrations, highlighting its versatility and effectiveness in optimizing ICL demonstration selection. Last but not least, DemoShapley demonstrates its ability to aid in identifying noisy data within the demonstration set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07523v1-abstract-full').style.display = 'none'; document.getElementById('2410.07523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06940">arXiv:2410.06940</a> <span> [<a href="https://arxiv.org/pdf/2410.06940">pdf</a>, <a href="https://arxiv.org/format/2410.06940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Sihyun Yu</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Sangkyung Kwak</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+H">Huiwon Jang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jonathan Huang</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06940v2-abstract-short" style="display: inline;"> Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06940v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06940v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06940v2-abstract-full" style="display: none;"> Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learning these representations. Moreover, training can be made easier by incorporating high-quality external visual representations, rather than relying solely on the diffusion models to learn them independently. We study this by introducing a straightforward regularization called REPresentation Alignment (REPA), which aligns the projections of noisy input hidden states in denoising networks with clean image representations obtained from external, pretrained visual encoders. The results are striking: our simple strategy yields significant improvements in both training efficiency and generation quality when applied to popular diffusion and flow-based transformers, such as DiTs and SiTs. For instance, our method can speed up SiT training by over 17.5$\times$, matching the performance (without classifier-free guidance) of a SiT-XL model trained for 7M steps in less than 400K steps. In terms of final generation quality, our approach achieves state-of-the-art results of FID=1.42 using classifier-free guidance with the guidance interval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06940v2-abstract-full').style.display = 'none'; document.getElementById('2410.06940v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Project page: https://sihyun.me/REPA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05694">arXiv:2410.05694</a> <span> [<a href="https://arxiv.org/pdf/2410.05694">pdf</a>, <a href="https://arxiv.org/format/2410.05694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiffusionGuard: A Robust Defense Against Malicious Diffusion-based Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+J+S">June Suk Choi</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyungmin Lee</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kimin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05694v1-abstract-short" style="display: inline;"> Recent advances in diffusion models have introduced a new era of text-guided image manipulation, enabling users to create realistic edited images with simple textual prompts. However, there is significant concern about the potential misuse of these methods, especially in creating misleading or harmful content. Although recent defense strategies, which introduce imperceptible adversarial noise to i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05694v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05694v1-abstract-full" style="display: none;"> Recent advances in diffusion models have introduced a new era of text-guided image manipulation, enabling users to create realistic edited images with simple textual prompts. However, there is significant concern about the potential misuse of these methods, especially in creating misleading or harmful content. Although recent defense strategies, which introduce imperceptible adversarial noise to induce model failure, have shown promise, they remain ineffective against more sophisticated manipulations, such as editing with a mask. In this work, we propose DiffusionGuard, a robust and effective defense method against unauthorized edits by diffusion-based image editing models, even in challenging setups. Through a detailed analysis of these models, we introduce a novel objective that generates adversarial noise targeting the early stage of the diffusion process. This approach significantly improves the efficiency and effectiveness of adversarial noises. We also introduce a mask-augmentation technique to enhance robustness against various masks during test time. Finally, we introduce a comprehensive benchmark designed to evaluate the effectiveness and robustness of methods in protecting against privacy threats in realistic scenarios. Through extensive experiments, we show that our method achieves stronger protection and improved mask robustness with lower computational costs compared to the strongest baseline. Additionally, our method exhibits superior transferability and better resilience to noise removal techniques compared to all baseline methods. Our source code is publicly available at https://github.com/choi403/DiffusionGuard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05694v1-abstract-full').style.display = 'none'; document.getElementById('2410.05694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03051">arXiv:2410.03051</a> <span> [<a href="https://arxiv.org/pdf/2410.03051">pdf</a>, <a href="https://arxiv.org/format/2410.03051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AuroraCap: Efficient, Performant Video Detailed Captioning and a New Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&query=Song%2C+E">Enxin Song</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yilun Du</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+C">Chenlin Meng</a>, <a href="/search/cs?searchtype=author&query=Madhavan%2C+V">Vashisht Madhavan</a>, <a href="/search/cs?searchtype=author&query=Bar-Tal%2C+O">Omer Bar-Tal</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jeng-Neng Hwang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Manning%2C+C+D">Christopher D. Manning</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03051v1-abstract-short" style="display: inline;"> Video detailed captioning is a key task which aims to generate comprehensive and coherent textual descriptions of video content, benefiting both video understanding and generation. In this paper, we propose AuroraCap, a video captioner based on a large multimodal model. We follow the simplest architecture design without additional parameters for temporal modeling. To address the overhead caused by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03051v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03051v1-abstract-full" style="display: none;"> Video detailed captioning is a key task which aims to generate comprehensive and coherent textual descriptions of video content, benefiting both video understanding and generation. In this paper, we propose AuroraCap, a video captioner based on a large multimodal model. We follow the simplest architecture design without additional parameters for temporal modeling. To address the overhead caused by lengthy video sequences, we implement the token merging strategy, reducing the number of input visual tokens. Surprisingly, we found that this strategy results in little performance loss. AuroraCap shows superior performance on various video and image captioning benchmarks, for example, obtaining a CIDEr of 88.9 on Flickr30k, beating GPT-4V (55.3) and Gemini-1.5 Pro (82.2). However, existing video caption benchmarks only include simple descriptions, consisting of a few dozen words, which limits research in this field. Therefore, we develop VDC, a video detailed captioning benchmark with over one thousand carefully annotated structured captions. In addition, we propose a new LLM-assisted metric VDCscore for bettering evaluation, which adopts a divide-and-conquer strategy to transform long caption evaluation into multiple short question-answer pairs. With the help of human Elo ranking, our experiments show that this benchmark better correlates with human judgments of video detailed captioning quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03051v1-abstract-full').style.display = 'none'; document.getElementById('2410.03051v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code, docs, weight, benchmark and training data are all avaliable at \href{https://rese1f.github.io/aurora-web/}{website}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02571">arXiv:2410.02571</a> <span> [<a href="https://arxiv.org/pdf/2410.02571">pdf</a>, <a href="https://arxiv.org/format/2410.02571">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SuperGS: Super-Resolution 3D Gaussian Splatting Enhanced by Variational Residual Features and Uncertainty-Augmented Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shiyun Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiru Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yinghao Zhu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+C">Chengwei Pan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiwang Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02571v3-abstract-short" style="display: inline;"> Recently, 3D Gaussian Splatting (3DGS) has exceled in novel view synthesis (NVS) with its real-time rendering capabilities and superior quality. However, it faces challenges for high-resolution novel view synthesis (HRNVS) due to the coarse nature of primitives derived from low-resolution input views. To address this issue, we propose Super-Resolution 3DGS (SuperGS), which is an expansion of 3DGS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02571v3-abstract-full').style.display = 'inline'; document.getElementById('2410.02571v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02571v3-abstract-full" style="display: none;"> Recently, 3D Gaussian Splatting (3DGS) has exceled in novel view synthesis (NVS) with its real-time rendering capabilities and superior quality. However, it faces challenges for high-resolution novel view synthesis (HRNVS) due to the coarse nature of primitives derived from low-resolution input views. To address this issue, we propose Super-Resolution 3DGS (SuperGS), which is an expansion of 3DGS designed with a two-stage coarse-to-fine training framework. In this framework, we use a latent feature field to represent the low-resolution scene, serving as both the initialization and foundational information for super-resolution optimization. Additionally, we introduce variational residual features to enhance high-resolution details, using their variance as uncertainty estimates to guide the densification process and loss computation. Furthermore, the introduction of a multi-view joint learning approach helps mitigate ambiguities caused by multi-view inconsistencies in the pseudo labels. Extensive experiments demonstrate that SuperGS surpasses state-of-the-art HRNVS methods on both real-world and synthetic datasets using only low-resolution inputs. Code is available at https://github.com/SYXieee/SuperGS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02571v3-abstract-full').style.display = 'none'; document.getElementById('2410.02571v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01176">arXiv:2410.01176</a> <span> [<a href="https://arxiv.org/pdf/2410.01176">pdf</a>, <a href="https://arxiv.org/format/2410.01176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generative Diffusion-based Contract Design for Efficient AI Twins Migration in Vehicular Embodied AI Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yue Zhong</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Jinbo Wen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+D">Dongdong Ye</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiangtian Nie</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiaozheng Gao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengli Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01176v1-abstract-short" style="display: inline;"> Embodied AI is a rapidly advancing field that bridges the gap between cyberspace and physical space, enabling a wide range of applications. This evolution has led to the development of the Vehicular Embodied AI NETwork (VEANET), where advanced AI capabilities are integrated into vehicular systems to enhance autonomous operations and decision-making. Embodied agents, such as Autonomous Vehicles (AV… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01176v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01176v1-abstract-full" style="display: none;"> Embodied AI is a rapidly advancing field that bridges the gap between cyberspace and physical space, enabling a wide range of applications. This evolution has led to the development of the Vehicular Embodied AI NETwork (VEANET), where advanced AI capabilities are integrated into vehicular systems to enhance autonomous operations and decision-making. Embodied agents, such as Autonomous Vehicles (AVs), are autonomous entities that can perceive their environment and take actions to achieve specific goals, actively interacting with the physical world. Embodied twins are digital models of these embodied agents, with various embodied AI twins for intelligent applications in cyberspace. In VEANET, embodied AI twins act as in-vehicle AI assistants to perform diverse tasks supporting autonomous driving using generative AI models. Due to limited computational resources of AVs, these AVs often offload computationally intensive tasks, such as constructing and updating embodied AI twins, to nearby RSUs. However, since the rapid mobility of AVs and the limited provision coverage of a single RSU, embodied AI twins require dynamic migrations from current RSU to other RSUs in real-time, resulting in the challenge of selecting suitable RSUs for efficient embodied AI twins migrations. Given information asymmetry, AVs cannot know the detailed information of RSUs. To this end, in this paper, we construct a multi-dimensional contract theoretical model between AVs and alternative RSUs. Considering that AVs may exhibit irrational behavior, we utilize prospect theory instead of expected utility theory to model the actual utilities of AVs. Finally, we employ a generative diffusion model-based algorithm to identify the optimal contract designs. Compared with traditional deep reinforcement learning algorithms, numerical results demonstrate the effectiveness of the proposed scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01176v1-abstract-full').style.display = 'none'; document.getElementById('2410.01176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19429">arXiv:2409.19429</a> <span> [<a href="https://arxiv.org/pdf/2409.19429">pdf</a>, <a href="https://arxiv.org/format/2409.19429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fast Encoding and Decoding for Implicit Video Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Ser-Nam Lim</a>, <a href="/search/cs?searchtype=author&query=Shrivastava%2C+A">Abhinav Shrivastava</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19429v2-abstract-short" style="display: inline;"> Despite the abundant availability and content richness for video data, its high-dimensionality poses challenges for video research. Recent advancements have explored the implicit representation for videos using neural networks, demonstrating strong performance in applications such as video compression and enhancement. However, the prolonged encoding time remains a persistent challenge for video Im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19429v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19429v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19429v2-abstract-full" style="display: none;"> Despite the abundant availability and content richness for video data, its high-dimensionality poses challenges for video research. Recent advancements have explored the implicit representation for videos using neural networks, demonstrating strong performance in applications such as video compression and enhancement. However, the prolonged encoding time remains a persistent challenge for video Implicit Neural Representations (INRs). In this paper, we focus on improving the speed of video encoding and decoding within implicit representations. We introduce two key components: NeRV-Enc, a transformer-based hyper-network for fast encoding; and NeRV-Dec, a parallel decoder for efficient video loading. NeRV-Enc achieves an impressive speed-up of $\mathbf{10^4\times}$ by eliminating gradient-based optimization. Meanwhile, NeRV-Dec simplifies video decoding, outperforming conventional codecs with a loading speed $\mathbf{11\times}$ faster, and surpassing RAM loading with pre-decoded videos ($\mathbf{2.5\times}$ faster while being $\mathbf{65\times}$ smaller in size). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19429v2-abstract-full').style.display = 'none'; document.getElementById('2409.19429v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024. Project page at https://haochen-rye.github.io/FastNeRV/, code will be at https://github.com/haochen-rye/FastNeRV</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18892">arXiv:2409.18892</a> <span> [<a href="https://arxiv.org/pdf/2409.18892">pdf</a>, <a href="https://arxiv.org/format/2409.18892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> IDGen: Item Discrimination Induced Prompt Generation for LLM Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fan Lin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shuyi Xie</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yong Dai</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wenlin Yao</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+T">Tianjiao Lang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zishan Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhichao Hu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xiao Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18892v2-abstract-short" style="display: inline;"> As Large Language Models (LLMs) grow increasingly adept at managing complex tasks, the evaluation set must keep pace with these advancements to ensure it remains sufficiently discriminative. Item Discrimination (ID) theory, which is widely used in educational assessment, measures the ability of individual test items to differentiate between high and low performers. Inspired by this theory, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18892v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18892v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18892v2-abstract-full" style="display: none;"> As Large Language Models (LLMs) grow increasingly adept at managing complex tasks, the evaluation set must keep pace with these advancements to ensure it remains sufficiently discriminative. Item Discrimination (ID) theory, which is widely used in educational assessment, measures the ability of individual test items to differentiate between high and low performers. Inspired by this theory, we propose an ID-induced prompt synthesis framework for evaluating LLMs to ensure the evaluation set can continually update and refine according to model abilities. Our data synthesis framework prioritizes both breadth and specificity. It can generate prompts that comprehensively evaluate the capabilities of LLMs while revealing meaningful performance differences between models, allowing for effective discrimination of their relative strengths and weaknesses across various tasks and domains. To produce high-quality data, we incorporate a self-correct mechanism into our generalization framework, and develop two models to predict prompt discrimination and difficulty score to facilitate our data synthesis framework, contributing valuable tools to evaluation data synthesis research. We apply our generated data to evaluate five SOTA models. Our data achieves an average score of 51.92, accompanied by a variance of 10.06. By contrast, previous works (i.e., SELF-INSTRUCT and WizardLM) obtain an average score exceeding 67, with a variance below 3.2. The results demonstrate that the data generated by our framework is more challenging and discriminative compared to previous works. We will release a dataset of over 3,000 carefully crafted prompts to facilitate evaluation research of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18892v2-abstract-full').style.display = 'none'; document.getElementById('2409.18892v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13648">arXiv:2409.13648</a> <span> [<a href="https://arxiv.org/pdf/2409.13648">pdf</a>, <a href="https://arxiv.org/format/2409.13648">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> V^3: Viewing Volumetric Videos on Mobiles via Streamable 2D Dynamic Gaussians </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+P">Penghao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhirui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liao Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+K">Kaixin Yao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Siyuan Xie</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jingyi Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Minye Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13648v2-abstract-short" style="display: inline;"> Experiencing high-fidelity volumetric video as seamlessly as 2D videos is a long-held dream. However, current dynamic 3DGS methods, despite their high rendering quality, face challenges in streaming on mobile devices due to computational and bandwidth constraints. In this paper, we introduce V^3 (Viewing Volumetric Videos), a novel approach that enables high-quality mobile rendering through the st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13648v2-abstract-full').style.display = 'inline'; document.getElementById('2409.13648v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13648v2-abstract-full" style="display: none;"> Experiencing high-fidelity volumetric video as seamlessly as 2D videos is a long-held dream. However, current dynamic 3DGS methods, despite their high rendering quality, face challenges in streaming on mobile devices due to computational and bandwidth constraints. In this paper, we introduce V^3 (Viewing Volumetric Videos), a novel approach that enables high-quality mobile rendering through the streaming of dynamic Gaussians. Our key innovation is to view dynamic 3DGS as 2D videos, facilitating the use of hardware video codecs. Additionally, we propose a two-stage training strategy to reduce storage requirements with rapid training speed. The first stage employs hash encoding and shallow MLP to learn motion, then reduces the number of Gaussians through pruning to meet the streaming requirements, while the second stage fine tunes other Gaussian attributes using residual entropy loss and temporal loss to improve temporal continuity. This strategy, which disentangles motion and appearance, maintains high rendering quality with compact storage requirements. Meanwhile, we designed a multi-platform player to decode and render 2D Gaussian videos. Extensive experiments demonstrate the effectiveness of V^3, outperforming other methods by enabling high-quality rendering and streaming on common devices, which is unseen before. As the first to stream dynamic Gaussians on mobile devices, our companion player offers users an unprecedented volumetric video experience, including smooth scrolling and instant sharing. Our project page with source code is available at https://authoritywang.github.io/v3/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13648v2-abstract-full').style.display = 'none'; document.getElementById('2409.13648v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10954">arXiv:2409.10954</a> <span> [<a href="https://arxiv.org/pdf/2409.10954">pdf</a>, <a href="https://arxiv.org/format/2409.10954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3689031.3696102">10.1145/3689031.3696102 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Ladon: High-Performance Multi-BFT Consensus via Dynamic Global Ordering (Extended Version) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+H">Hanzheng Lyu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaokang Xie</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+J">Jianyu Niu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Beschastnikh%2C+I">Ivan Beschastnikh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10954v1-abstract-short" style="display: inline;"> Multi-BFT consensus runs multiple leader-based consensus instances in parallel, circumventing the leader bottleneck of a single instance. However, it contains an Achilles' heel: the need to globally order output blocks across instances. Deriving this global ordering is challenging because it must cope with different rates at which blocks are produced by instances. Prior Multi-BFT designs assign ea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10954v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10954v1-abstract-full" style="display: none;"> Multi-BFT consensus runs multiple leader-based consensus instances in parallel, circumventing the leader bottleneck of a single instance. However, it contains an Achilles' heel: the need to globally order output blocks across instances. Deriving this global ordering is challenging because it must cope with different rates at which blocks are produced by instances. Prior Multi-BFT designs assign each block a global index before creation, leading to poor performance. We propose Ladon, a high-performance Multi-BFT protocol that allows varying instance block rates. Our key idea is to order blocks across instances dynamically, which eliminates blocking on slow instances. We achieve dynamic global ordering by assigning monotonic ranks to blocks. We pipeline rank coordination with the consensus process to reduce protocol overhead and combine aggregate signatures with rank information to reduce message complexity. Ladon's dynamic ordering enables blocks to be globally ordered according to their generation, which respects inter-block causality. We implemented and evaluated Ladon by integrating it with both PBFT and HotStuff protocols. Our evaluation shows that Ladon-PBFT (resp., Ladon-HotStuff) improves the peak throughput of the prior art by $\approx$8x (resp., 2x) and reduces latency by $\approx$62% (resp., 23%), when deployed with one straggling replica (out of 128 replicas) in a WAN setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10954v1-abstract-full').style.display = 'none'; document.getElementById('2409.10954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09860">arXiv:2409.09860</a> <span> [<a href="https://arxiv.org/pdf/2409.09860">pdf</a>, <a href="https://arxiv.org/format/2409.09860">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.14722/ndss.2025.23090">10.14722/ndss.2025.23090 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Revisiting Physical-World Adversarial Attack on Traffic Sign Recognition: A Commercial Systems Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+N">Ningfei Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shaoyuan Xie</a>, <a href="/search/cs?searchtype=author&query=Sato%2C+T">Takami Sato</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yunpeng Luo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q+A">Qi Alfred Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09860v1-abstract-short" style="display: inline;"> Traffic Sign Recognition (TSR) is crucial for safe and correct driving automation. Recent works revealed a general vulnerability of TSR models to physical-world adversarial attacks, which can be low-cost, highly deployable, and capable of causing severe attack effects such as hiding a critical traffic sign or spoofing a fake one. However, so far existing works generally only considered evaluating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09860v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09860v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09860v1-abstract-full" style="display: none;"> Traffic Sign Recognition (TSR) is crucial for safe and correct driving automation. Recent works revealed a general vulnerability of TSR models to physical-world adversarial attacks, which can be low-cost, highly deployable, and capable of causing severe attack effects such as hiding a critical traffic sign or spoofing a fake one. However, so far existing works generally only considered evaluating the attack effects on academic TSR models, leaving the impacts of such attacks on real-world commercial TSR systems largely unclear. In this paper, we conduct the first large-scale measurement of physical-world adversarial attacks against commercial TSR systems. Our testing results reveal that it is possible for existing attack works from academia to have highly reliable (100\%) attack success against certain commercial TSR system functionality, but such attack capabilities are not generalizable, leading to much lower-than-expected attack success rates overall. We find that one potential major factor is a spatial memorization design that commonly exists in today's commercial TSR systems. We design new attack success metrics that can mathematically model the impacts of such design on the TSR system-level attack success, and use them to revisit existing attacks. Through these efforts, we uncover 7 novel observations, some of which directly challenge the observations or claims in prior works due to the introduction of the new metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09860v1-abstract-full').style.display = 'none'; document.getElementById('2409.09860v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NDSS 2025</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xie%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xie%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository