CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 721 results for author: <span class="mathjax">Gu, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Gu%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Gu, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gu%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gu, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13715">arXiv:2411.13715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13715">pdf</a>, <a href="https://arxiv.org/format/2411.13715">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SimPhony: A Device-Circuit-Architecture Cross-Layer Modeling and Simulation Framework for Heterogeneous Electronic-Photonic AI System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Begovic%2C+A">Amir Begovic</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Rena Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13715v1-abstract-short" style="display: inline;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accura&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13715v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13715v1-abstract-full" style="display: none;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accurate, fast, and easy-to-use EPIC AI system simulation framework significantly limits the exploration of hardware innovations and system evaluations on common benchmarks. To address this gap, we propose SimPhony, a cross-layer modeling and simulation framework for heterogeneous electronic-photonic AI systems. SimPhony offers a platform that enables (1) generic, extensible hardware topology representation that supports heterogeneous multi-core architectures with diverse photonic tensor core designs; (2) optics-specific dataflow modeling with unique multi-dimensional parallelism and reuse beyond spatial/temporal dimensions; (3) data-aware energy modeling with realistic device responses, layout-aware area estimation, link budget analysis, and bandwidth-adaptive memory modeling; and (4) seamless integration with model training framework for hardware/software co-simulation. By providing a unified, versatile, and high-fidelity simulation platform, SimPhony enables researchers to innovate and evaluate EPIC AI hardware across multiple domains, facilitating the next leap in emerging AI hardware. We open-source our codes at https://github.com/ScopeX-ASU/SimPhony <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'none'; document.getElementById('2411.13715v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09820">arXiv:2411.09820</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09820">pdf</a>, <a href="https://arxiv.org/format/2411.09820">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> WelQrate: Defining the Gold Standard in Small Molecule Drug Discovery Benchmarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yunchao"> Yunchao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu"> Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Ha Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Moretti%2C+R">Rocco Moretti</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Z">Zhaoqian Su</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiawei Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Bodenheimer%2C+B">Bobby Bodenheimer</a>, <a href="/search/cs?searchtype=author&amp;query=Weaver%2C+C+D">Charles David Weaver</a>, <a href="/search/cs?searchtype=author&amp;query=Meiler%2C+J">Jens Meiler</a>, <a href="/search/cs?searchtype=author&amp;query=Derr%2C+T">Tyler Derr</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09820v1-abstract-short" style="display: inline;"> While deep learning has revolutionized computer-aided drug discovery, the AI community has predominantly focused on model innovation and placed less emphasis on establishing best benchmarking practices. We posit that without a sound model evaluation framework, the AI community&#39;s efforts cannot reach their full potential, thereby slowing the progress and transfer of innovation into real-world drug&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09820v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09820v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09820v1-abstract-full" style="display: none;"> While deep learning has revolutionized computer-aided drug discovery, the AI community has predominantly focused on model innovation and placed less emphasis on establishing best benchmarking practices. We posit that without a sound model evaluation framework, the AI community&#39;s efforts cannot reach their full potential, thereby slowing the progress and transfer of innovation into real-world drug discovery. Thus, in this paper, we seek to establish a new gold standard for small molecule drug discovery benchmarking, WelQrate. Specifically, our contributions are threefold: WelQrate Dataset Collection - we introduce a meticulously curated collection of 9 datasets spanning 5 therapeutic target classes. Our hierarchical curation pipelines, designed by drug discovery experts, go beyond the primary high-throughput screen by leveraging additional confirmatory and counter screens along with rigorous domain-driven preprocessing, such as Pan-Assay Interference Compounds (PAINS) filtering, to ensure the high-quality data in the datasets; WelQrate Evaluation Framework - we propose a standardized model evaluation framework considering high-quality datasets, featurization, 3D conformation generation, evaluation metrics, and data splits, which provides a reliable benchmarking for drug discovery experts conducting real-world virtual screening; Benchmarking - we evaluate model performance through various research questions using the WelQrate dataset collection, exploring the effects of different models, dataset quality, featurization methods, and data splitting strategies on the results. In summary, we recommend adopting our proposed WelQrate as the gold standard in small molecule drug discovery benchmarking. The WelQrate dataset collection, along with the curation codes, and experimental scripts are all publicly available at WelQrate.org. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09820v1-abstract-full').style.display = 'none'; document.getElementById('2411.09820v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">* denotes equal contribution</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08210">arXiv:2411.08210</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08210">pdf</a>, <a href="https://arxiv.org/format/2411.08210">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> BOSON$^{-1}$: Understanding and Enabling Physically-Robust Photonic Inverse Design with Adaptive Variation-Aware Subspace Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhengqi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Begovic%2C+A">Amir Begovic</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Haoyu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+H">Haoxing Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z+R">Zhaoran Rena Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Boning%2C+D">Duane Boning</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08210v1-abstract-short" style="display: inline;"> Nanophotonic device design aims to optimize photonic structures to meet specific requirements across various applications. Inverse design has unlocked non-intuitive, high-dimensional design spaces, enabling the discovery of high-performance devices beyond heuristic or analytic methods. The adjoint method, which calculates gradients for all variables using just two simulations, enables efficient na&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08210v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08210v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08210v1-abstract-full" style="display: none;"> Nanophotonic device design aims to optimize photonic structures to meet specific requirements across various applications. Inverse design has unlocked non-intuitive, high-dimensional design spaces, enabling the discovery of high-performance devices beyond heuristic or analytic methods. The adjoint method, which calculates gradients for all variables using just two simulations, enables efficient navigation of this complex space. However, many inverse-designed structures, while numerically plausible, are difficult to fabricate and sensitive to variations, limiting their practical use. The discrete nature with numerous local-optimal structures also pose significant optimization challenges, often causing gradient-based methods to converge on suboptimal designs. In this work, we formulate inverse design as a fabrication-restricted, discrete, probabilistic optimization problem and introduce BOSON-1, an end-to-end, variation-aware subspace optimization framework to address the challenges of manufacturability, robustness, and optimizability. To overcome optimization difficulty, we propose dense target-enhanced gradient flows to mitigate misleading local optima and introduce a conditional subspace optimization strategy to create high-dimensional tunnels to escape local optima. Furthermore, we significantly reduce the runtime associated with optimizing across exponential variation samples through an adaptive sampling-based robust optimization, ensuring both efficiency and variation robustness. On three representative photonic device benchmarks, our proposed inverse design methodology BOSON^-1 delivers fabricable structures and achieves the best convergence and performance under realistic variations, outperforming prior arts with 74.3% post-fabrication performance. We open-source our codes at https://github.com/ScopeX-ASU/BOSON. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08210v1-abstract-full').style.display = 'none'; document.getElementById('2411.08210v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. Accepted IEEE DATE 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05748">arXiv:2411.05748</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05748">pdf</a>, <a href="https://arxiv.org/format/2411.05748">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Multi-Dimensional Reconfigurable, Physically Composable Hybrid Diffractive Optical Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yu Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05748v1-abstract-short" style="display: inline;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05748v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05748v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05748v1-abstract-full" style="display: none;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To overcome this challenge, we introduce, for the first time, a multi-dimensional reconfigurable hybrid diffractive ONN system (MDR-HDONN), a physically composable architecture that unlocks a new degree of freedom and unprecedented versatility in DONNs. By leveraging full-system learnability, MDR-HDONN repurposes fixed fabricated optical hardware, achieving exponentially expanded functionality and superior task adaptability through the differentiable learning of system variables. Furthermore, MDR-HDONN adopts a hybrid optical/photonic design, combining the reconfigurability of integrated photonics with the ultra-parallelism of free-space diffractive systems. Extensive evaluations demonstrate that MDR-HDONN has digital-comparable accuracy on various task adaptations with 74x faster speed and 194x lower energy. Compared to prior DONNs, MDR-HDONN shows exponentially larger functional space with 5x faster training speed, paving the way for a new paradigm of versatile, composable, hybrid optical/photonic AI computing. We will open-source our codes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05748v1-abstract-full').style.display = 'none'; document.getElementById('2411.05748v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03527">arXiv:2411.03527</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03527">pdf</a>, <a href="https://arxiv.org/format/2411.03527">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> PACE: Pacing Operator Learning to Accurate Optical Field Simulation for Complicated Photonic Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hanqing Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cong%2C+W">Wenyan Cong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guojin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+S">Shupeng Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R+T">Ray T. Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+D+Z">David Z. Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03527v1-abstract-short" style="display: inline;"> Electromagnetic field simulation is central to designing, optimizing, and validating photonic devices and circuits. However, costly computation associated with numerical simulation poses a significant bottleneck, hindering scalability and turnaround time in the photonic circuit design process. Neural operators offer a promising alternative, but existing SOTA approaches, NeurOLight, struggle with p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03527v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03527v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03527v1-abstract-full" style="display: none;"> Electromagnetic field simulation is central to designing, optimizing, and validating photonic devices and circuits. However, costly computation associated with numerical simulation poses a significant bottleneck, hindering scalability and turnaround time in the photonic circuit design process. Neural operators offer a promising alternative, but existing SOTA approaches, NeurOLight, struggle with predicting high-fidelity fields for real-world complicated photonic devices, with the best reported 0.38 normalized mean absolute error in NeurOLight. The inter-plays of highly complex light-matter interaction, e.g., scattering and resonance, sensitivity to local structure details, non-uniform learning complexity for full-domain simulation, and rich frequency information, contribute to the failure of existing neural PDE solvers. In this work, we boost the prediction fidelity to an unprecedented level for simulating complex photonic devices with a novel operator design driven by the above challenges. We propose a novel cross-axis factorized PACE operator with a strong long-distance modeling capacity to connect the full-domain complex field pattern with local device structures. Inspired by human learning, we further divide and conquer the simulation task for extremely hard cases into two progressively easy tasks, with a first-stage model learning an initial solution refined by a second model. On various complicated photonic device benchmarks, we demonstrate one sole PACE model is capable of achieving 73% lower error with 50% fewer parameters compared with various recent ML for PDE solvers. The two-stage setup further advances high-fidelity simulation for even more intricate cases. In terms of runtime, PACE demonstrates 154-577x and 11.8-12x simulation speedup over numerical solver using scipy or highly-optimized pardiso solver, respectively. We open sourced the code and dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03527v1-abstract-full').style.display = 'none'; document.getElementById('2411.03527v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepeted by Neurips 2024, 21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02437">arXiv:2411.02437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02437">pdf</a>, <a href="https://arxiv.org/format/2411.02437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TypeScore: A Text Fidelity Metric for Text-to-Image Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sampaio%2C+G+G">Georgia Gabriela Sampaio</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruixiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+S">Shuangfei Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiatao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Susskind%2C+J">Josh Susskind</a>, <a href="/search/cs?searchtype=author&amp;query=Jaitly%2C+N">Navdeep Jaitly</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yizhe Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02437v1-abstract-short" style="display: inline;"> Evaluating text-to-image generative models remains a challenge, despite the remarkable progress being made in their overall performances. While existing metrics like CLIPScore work for coarse evaluations, they lack the sensitivity to distinguish finer differences as model performance rapidly improves. In this work, we focus on the text rendering aspect of these models, which provides a lens for ev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02437v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02437v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02437v1-abstract-full" style="display: none;"> Evaluating text-to-image generative models remains a challenge, despite the remarkable progress being made in their overall performances. While existing metrics like CLIPScore work for coarse evaluations, they lack the sensitivity to distinguish finer differences as model performance rapidly improves. In this work, we focus on the text rendering aspect of these models, which provides a lens for evaluating a generative model&#39;s fine-grained instruction-following capabilities. To this end, we introduce a new evaluation framework called TypeScore to sensitively assess a model&#39;s ability to generate images with high-fidelity embedded text by following precise instructions. We argue that this text generation capability serves as a proxy for general instruction-following ability in image synthesis. TypeScore uses an additional image description model and leverages an ensemble dissimilarity measure between the original and extracted text to evaluate the fidelity of the rendered text. Our proposed metric demonstrates greater resolution than CLIPScore to differentiate popular image generation models across a range of instructions with diverse text styles. Our study also evaluates how well these vision-language models (VLMs) adhere to stylistic instructions, disentangling style evaluation from embedded-text fidelity. Through human evaluation studies, we quantitatively meta-evaluate the effectiveness of the metric. Comprehensive analysis is conducted to explore factors such as text length, captioning models, and current progress towards human parity on this task. The framework provides insights into remaining gaps in instruction-following for image generation with embedded text. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02437v1-abstract-full').style.display = 'none'; document.getElementById('2411.02437v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01106">arXiv:2411.01106</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01106">pdf</a>, <a href="https://arxiv.org/format/2411.01106">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yufan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Changyou Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+T">Tong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01106v1-abstract-short" style="display: inline;"> Large multimodal models (LMMs) have recently shown great progress in text-rich image understanding, yet they still struggle with complex, multi-page, visually-rich documents. Traditional methods using document parsers for retrieval-augmented generation suffer from performance and efficiency limitations, while directly presenting all pages to LMMs leads to inefficiencies, especially with lengthy do&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01106v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01106v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01106v1-abstract-full" style="display: none;"> Large multimodal models (LMMs) have recently shown great progress in text-rich image understanding, yet they still struggle with complex, multi-page, visually-rich documents. Traditional methods using document parsers for retrieval-augmented generation suffer from performance and efficiency limitations, while directly presenting all pages to LMMs leads to inefficiencies, especially with lengthy documents. In this work, we present a novel framework named LoRA-Contextualizing Adaptation of Large multimodal models (LoCAL), which broadens the capabilities of any LMM to support long-document understanding. We demonstrate that LMMs can effectively serve as multimodal retrievers, fetching relevant pages to answer user questions based on these pages. LoCAL is implemented with two specific LMM adapters: one for evidence page retrieval and another for question answering. Empirical results show state-of-the-art performance on public benchmarks, demonstrating the effectiveness of LoCAL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01106v1-abstract-full').style.display = 'none'; document.getElementById('2411.01106v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Currently Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00883">arXiv:2411.00883</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00883">pdf</a>, <a href="https://arxiv.org/format/2411.00883">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Technical Report for ActivityNet Challenge 2022 -- Temporal Action Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shimin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jianyang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yandong Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00883v1-abstract-short" style="display: inline;"> In the task of temporal action localization of ActivityNet-1.3 datasets, we propose to locate the temporal boundaries of each action and predict action class in untrimmed videos. We first apply VideoSwinTransformer as feature extractor to extract different features. Then we apply a unified network following Faster-TAD to simultaneously obtain proposals and semantic labels. Last, we ensemble the re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00883v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00883v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00883v1-abstract-full" style="display: none;"> In the task of temporal action localization of ActivityNet-1.3 datasets, we propose to locate the temporal boundaries of each action and predict action class in untrimmed videos. We first apply VideoSwinTransformer as feature extractor to extract different features. Then we apply a unified network following Faster-TAD to simultaneously obtain proposals and semantic labels. Last, we ensemble the results of different temporal action detection models which complement each other. Faster-TAD simplifies the pipeline of TAD and gets remarkable performance, obtaining comparable results as those of multi-step approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00883v1-abstract-full').style.display = 'none'; document.getElementById('2411.00883v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2204.02674</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00027">arXiv:2411.00027</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00027">pdf</a>, <a href="https://arxiv.org/format/2411.00027">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Personalization of Large Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhehao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&amp;query=Kveton%2C+B">Branislav Kveton</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Y">Yijia Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Diyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zamani%2C+H">Hamed Zamani</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Barrow%2C+J">Joe Barrow</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Derr%2C+T">Tyler Derr</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongjie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+S">Subrata Mitra</a>, <a href="/search/cs?searchtype=author&amp;query=Lipka%2C+N">Nedim Lipka</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmed%2C+N">Nesreen Ahmed</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00027v1-abstract-short" style="display: inline;"> Personalization of Large Language Models (LLMs) has recently become increasingly important with a wide range of applications. Despite the importance and recent progress, most existing works on personalized LLMs have focused either entirely on (a) personalized text generation or (b) leveraging LLMs for personalization-related downstream applications, such as recommendation systems. In this work, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00027v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00027v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00027v1-abstract-full" style="display: none;"> Personalization of Large Language Models (LLMs) has recently become increasingly important with a wide range of applications. Despite the importance and recent progress, most existing works on personalized LLMs have focused either entirely on (a) personalized text generation or (b) leveraging LLMs for personalization-related downstream applications, such as recommendation systems. In this work, we bridge the gap between these two separate main directions for the first time by introducing a taxonomy for personalized LLM usage and summarizing the key differences and challenges. We provide a formalization of the foundations of personalized LLMs that consolidates and expands notions of personalization of LLMs, defining and discussing novel facets of personalization, usage, and desiderata of personalized LLMs. We then unify the literature across these diverse fields and usage scenarios by proposing systematic taxonomies for the granularity of personalization, personalization techniques, datasets, evaluation methods, and applications of personalized LLMs. Finally, we highlight challenges and important open problems that remain to be addressed. By unifying and surveying recent research using the proposed taxonomies, we aim to provide a clear guide to the existing literature and different facets of personalization in LLMs, empowering both researchers and practitioners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00027v1-abstract-full').style.display = 'none'; document.getElementById('2411.00027v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22939">arXiv:2410.22939</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22939">pdf</a>, <a href="https://arxiv.org/format/2410.22939">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AdaptiveISP: Learning an Adaptive Image Signal Processor for Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+T">Tianfan Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinwei Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22939v1-abstract-short" style="display: inline;"> Image Signal Processors (ISPs) convert raw sensor signals into digital images, which significantly influence the image quality and the performance of downstream computer vision tasks. Designing ISP pipeline and tuning ISP parameters are two key steps for building an imaging and vision system. To find optimal ISP configurations, recent works use deep neural networks as a proxy to search for ISP par&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22939v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22939v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22939v1-abstract-full" style="display: none;"> Image Signal Processors (ISPs) convert raw sensor signals into digital images, which significantly influence the image quality and the performance of downstream computer vision tasks. Designing ISP pipeline and tuning ISP parameters are two key steps for building an imaging and vision system. To find optimal ISP configurations, recent works use deep neural networks as a proxy to search for ISP parameters or ISP pipelines. However, these methods are primarily designed to maximize the image quality, which are sub-optimal in the performance of high-level computer vision tasks such as detection, recognition, and tracking. Moreover, after training, the learned ISP pipelines are mostly fixed at the inference time, whose performance degrades in dynamic scenes. To jointly optimize ISP structures and parameters, we propose AdaptiveISP, a task-driven and scene-adaptive ISP. One key observation is that for the majority of input images, only a few processing modules are needed to improve the performance of downstream recognition tasks, and only a few inputs require more processing. Based on this, AdaptiveISP utilizes deep reinforcement learning to automatically generate an optimal ISP pipeline and the associated ISP parameters to maximize the detection performance. Experimental results show that AdaptiveISP not only surpasses the prior state-of-the-art methods for object detection but also dynamically manages the trade-off between detection performance and computational cost, especially suitable for scenes with large dynamic range variations. Project website: https://openimaginglab.github.io/AdaptiveISP/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22939v1-abstract-full').style.display = 'none'; document.getElementById('2410.22939v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22916">arXiv:2410.22916</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22916">pdf</a>, <a href="https://arxiv.org/format/2410.22916">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Explainable Behavior Cloning: Teaching Large Language Model Agents through Learning by Demonstration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Y">Yanchu Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haiqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Renen Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+C">Chenyi Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinjie Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+Z">Zhixuan Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22916v1-abstract-short" style="display: inline;"> Autonomous mobile app interaction has become increasingly important with growing complexity of mobile applications. Developing intelligent agents that can effectively navigate and interact with mobile apps remains a significant challenge. In this paper, we propose an Explainable Behavior Cloning LLM Agent (EBC-LLMAgent), a novel approach that combines large language models (LLMs) with behavior clo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22916v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22916v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22916v1-abstract-full" style="display: none;"> Autonomous mobile app interaction has become increasingly important with growing complexity of mobile applications. Developing intelligent agents that can effectively navigate and interact with mobile apps remains a significant challenge. In this paper, we propose an Explainable Behavior Cloning LLM Agent (EBC-LLMAgent), a novel approach that combines large language models (LLMs) with behavior cloning by learning demonstrations to create intelligent and explainable agents for autonomous mobile app interaction. EBC-LLMAgent consists of three core modules: Demonstration Encoding, Code Generation, and UI Mapping, which work synergistically to capture user demonstrations, generate executable codes, and establish accurate correspondence between code and UI elements. We introduce the Behavior Cloning Chain Fusion technique to enhance the generalization capabilities of the agent. Extensive experiments on five popular mobile applications from diverse domains demonstrate the superior performance of EBC-LLMAgent, achieving high success rates in task completion, efficient generalization to unseen scenarios, and the generation of meaningful explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22916v1-abstract-full').style.display = 'none'; document.getElementById('2410.22916v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21595">arXiv:2410.21595</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21595">pdf</a>, <a href="https://arxiv.org/format/2410.21595">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Trees for (Un)structured Data: Tractability, Performance, and Interpretability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bertsimas%2C+D">Dimitris Bertsimas</a>, <a href="/search/cs?searchtype=author&amp;query=Everest%2C+L">Lisa Everest</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiayi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Peroni%2C+M">Matthew Peroni</a>, <a href="/search/cs?searchtype=author&amp;query=Stoumpou%2C+V">Vasiliki Stoumpou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21595v1-abstract-short" style="display: inline;"> Decision Trees have remained a popular machine learning method for tabular datasets, mainly due to their interpretability. However, they lack the expressiveness needed to handle highly nonlinear or unstructured datasets. Motivated by recent advances in tree-based machine learning (ML) techniques and first-order optimization methods, we introduce Generalized Soft Trees (GSTs), which extend soft dec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21595v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21595v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21595v1-abstract-full" style="display: none;"> Decision Trees have remained a popular machine learning method for tabular datasets, mainly due to their interpretability. However, they lack the expressiveness needed to handle highly nonlinear or unstructured datasets. Motivated by recent advances in tree-based machine learning (ML) techniques and first-order optimization methods, we introduce Generalized Soft Trees (GSTs), which extend soft decision trees (STs) and are capable of processing images directly. We demonstrate their advantages with respect to tractability, performance, and interpretability. We develop a tractable approach to growing GSTs, given by the DeepTree algorithm, which, in addition to new regularization terms, produces high-quality models with far fewer nodes and greater interpretability than traditional soft trees. We test the performance of our GSTs on benchmark tabular and image datasets, including MIMIC-IV, MNIST, Fashion MNIST, CIFAR-10 and Celeb-A. We show that our approach outperforms other popular tree methods (CART, Random Forests, XGBoost) in almost all of the datasets, with Convolutional Trees having a significant edge in the hardest CIFAR-10 and Fashion MNIST datasets. Finally, we explore the interpretability of our GSTs and find that even the most complex GSTs are considerably more interpretable than deep neural networks. Overall, our approach of Generalized Soft Trees provides a tractable method that is high-performing on (un)structured datasets and preserves interpretability more than traditional deep learning methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21595v1-abstract-full').style.display = 'none'; document.getElementById('2410.21595v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Machine Learning. Authors are listed in alphabetical order</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20466">arXiv:2410.20466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20466">pdf</a>, <a href="https://arxiv.org/format/2410.20466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Guidance Disentanglement Network for Optics-Guided Thermal UAV Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhicheng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Juanjuan Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenglong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20466v1-abstract-short" style="display: inline;"> Optics-guided Thermal UAV image Super-Resolution (OTUAV-SR) has attracted significant research interest due to its potential applications in security inspection, agricultural measurement, and object detection. Existing methods often employ single guidance model to generate the guidance features from optical images to assist thermal UAV images super-resolution. However, single guidance models make&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20466v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20466v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20466v1-abstract-full" style="display: none;"> Optics-guided Thermal UAV image Super-Resolution (OTUAV-SR) has attracted significant research interest due to its potential applications in security inspection, agricultural measurement, and object detection. Existing methods often employ single guidance model to generate the guidance features from optical images to assist thermal UAV images super-resolution. However, single guidance models make it difficult to generate effective guidance features under favorable and adverse conditions in UAV scenarios, thus limiting the performance of OTUAV-SR. To address this issue, we propose a novel Guidance Disentanglement network (GDNet), which disentangles the optical image representation according to typical UAV scenario attributes to form guidance features under both favorable and adverse conditions, for robust OTUAV-SR. Moreover, we design an attribute-aware fusion module to combine all attribute-based optical guidance features, which could form a more discriminative representation and fit the attribute-agnostic guidance process. To facilitate OTUAV-SR research in complex UAV scenarios, we introduce VGTSR2.0, a large-scale benchmark dataset containing 3,500 aligned optical-thermal image pairs captured under diverse conditions and scenes. Extensive experiments on VGTSR2.0 demonstrate that GDNet significantly improves OTUAV-SR performance over state-of-the-art methods, especially in the challenging low-light and foggy environments commonly encountered in UAV scenarios. The dataset and code will be publicly available at https://github.com/Jocelyney/GDNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20466v1-abstract-full').style.display = 'none'; document.getElementById('2410.20466v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 19 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20011">arXiv:2410.20011</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20011">pdf</a>, <a href="https://arxiv.org/format/2410.20011">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Small Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Van+Nguyen%2C+C">Chien Van Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xuan Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Aponte%2C+R">Ryan Aponte</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Y">Yu Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Basu%2C+S">Samyadeep Basu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhengmian Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Parmar%2C+M">Mihir Parmar</a>, <a href="/search/cs?searchtype=author&amp;query=Kunapuli%2C+S">Sasidhar Kunapuli</a>, <a href="/search/cs?searchtype=author&amp;query=Barrow%2C+J">Joe Barrow</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Ashish Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmed%2C+N+K">Nesreen K. Ahmed</a>, <a href="/search/cs?searchtype=author&amp;query=Lipka%2C+N">Nedim Lipka</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Deilamsalehy%2C+H">Hanieh Deilamsalehy</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+N">Namyong Park</a>, <a href="/search/cs?searchtype=author&amp;query=Rimer%2C+M">Mike Rimer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhehao Zhang</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20011v1-abstract-short" style="display: inline;"> Small Language Models (SLMs) have become increasingly important due to their efficiency and performance to perform various language tasks with minimal computational resources, making them ideal for various settings including on-device, mobile, edge devices, among many others. In this article, we present a comprehensive survey on SLMs, focusing on their architectures, training techniques, and model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20011v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20011v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20011v1-abstract-full" style="display: none;"> Small Language Models (SLMs) have become increasingly important due to their efficiency and performance to perform various language tasks with minimal computational resources, making them ideal for various settings including on-device, mobile, edge devices, among many others. In this article, we present a comprehensive survey on SLMs, focusing on their architectures, training techniques, and model compression techniques. We propose a novel taxonomy for categorizing the methods used to optimize SLMs, including model compression, pruning, and quantization techniques. We summarize the benchmark datasets that are useful for benchmarking SLMs along with the evaluation metrics commonly used. Additionally, we highlight key open challenges that remain to be addressed. Our survey aims to serve as a valuable resource for researchers and practitioners interested in developing and deploying small yet efficient language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20011v1-abstract-full').style.display = 'none'; document.getElementById('2410.20011v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19720">arXiv:2410.19720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19720">pdf</a>, <a href="https://arxiv.org/format/2410.19720">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 2D-DPO: Scaling Direct Preference Optimization with 2-Dimensional Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Hui Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+X">Xingyuan Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weixun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jihao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19720v1-abstract-short" style="display: inline;"> Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19720v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19720v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19720v1-abstract-full" style="display: none;"> Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference of DPO to two dimensions: segments and aspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For the segment dimension, we divide the response into sentences and assign scores to each segment. For the aspect dimension, we meticulously design several criteria covering the response quality rubrics. With the 2-dimensional signals as feedback, we develop a 2D-DPO framework, decomposing the overall objective into multi-segment and multi-aspect objectives. Extensive experiments on popular benchmarks demonstrate that 2D-DPO performs better than methods that optimize for scalar or 1-dimensional preferences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19720v1-abstract-full').style.display = 'none'; document.getElementById('2410.19720v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first four authors contributed equally, 25 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18974">arXiv:2410.18974</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18974">pdf</a>, <a href="https://arxiv.org/format/2410.18974">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 3D-Adapter: Geometry-Consistent Multi-View Diffusion for High-Quality 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hansheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+B">Bokui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yulin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+R">Ruoxi Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Linqi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C+Z">Connor Z. Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiayuan Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&amp;query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&amp;query=Guibas%2C+L">Leonidas Guibas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18974v1-abstract-short" style="display: inline;"> Multi-view image diffusion models have significantly advanced open-domain 3D object generation. However, most existing models rely on 2D network architectures that lack inherent 3D biases, resulting in compromised geometric consistency. To address this challenge, we introduce 3D-Adapter, a plug-in module designed to infuse 3D geometry awareness into pretrained image diffusion models. Central to ou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18974v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18974v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18974v1-abstract-full" style="display: none;"> Multi-view image diffusion models have significantly advanced open-domain 3D object generation. However, most existing models rely on 2D network architectures that lack inherent 3D biases, resulting in compromised geometric consistency. To address this challenge, we introduce 3D-Adapter, a plug-in module designed to infuse 3D geometry awareness into pretrained image diffusion models. Central to our approach is the idea of 3D feedback augmentation: for each denoising step in the sampling loop, 3D-Adapter decodes intermediate multi-view features into a coherent 3D representation, then re-encodes the rendered RGBD views to augment the pretrained base model through feature addition. We study two variants of 3D-Adapter: a fast feed-forward version based on Gaussian splatting and a versatile training-free version utilizing neural fields and meshes. Our extensive experiments demonstrate that 3D-Adapter not only greatly enhances the geometry quality of text-to-multi-view models such as Instant3D and Zero123++, but also enables high-quality 3D generation using the plain text-to-image Stable Diffusion. Furthermore, we showcase the broad application potential of 3D-Adapter by presenting high quality results in text-to-3D, image-to-3D, text-to-texture, and text-to-avatar tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18974v1-abstract-full').style.display = 'none'; document.getElementById('2410.18974v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://lakonik.github.io/3d-adapter/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18336">arXiv:2410.18336</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18336">pdf</a>, <a href="https://arxiv.org/format/2410.18336">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Assessing the Creativity of LLMs in Proposing Novel Solutions to Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Junyi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jingyi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xinyun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wenpeng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guiling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18336v1-abstract-short" style="display: inline;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18336v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18336v1-abstract-full" style="display: none;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creative potential of Large Language Models (LLMs) in mathematical reasoning, an aspect that has received limited attention in prior research. We introduce a novel framework and benchmark, CreativeMath, which encompasses problems ranging from middle school curricula to Olympic-level competitions, designed to assess LLMs&#39; ability to propose innovative solutions after some known solutions have been provided. Our experiments demonstrate that, while LLMs perform well on standard mathematical tasks, their capacity for creative problem-solving varies considerably. Notably, the Gemini-1.5-Pro model outperformed other LLMs in generating novel solutions. This research opens a new frontier in evaluating AI creativity, shedding light on both the strengths and limitations of LLMs in fostering mathematical innovation, and setting the stage for future developments in AI-assisted mathematical discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18336v1-abstract-full').style.display = 'none'; document.getElementById('2410.18336v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18311">arXiv:2410.18311</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18311">pdf</a>, <a href="https://arxiv.org/format/2410.18311">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CoreInfer: Accelerating Large Language Model Inference with Semantics-Inspired Adaptive Sparse Activation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qinsi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Vahidian%2C+S">Saeed Vahidian</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Hancheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jianyang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18311v1-abstract-short" style="display: inline;"> Large language models (LLMs) with billions of parameters have sparked a new wave of exciting AI applications. However, their high computational costs and memory demands during inference pose significant challenges. Adaptive sparse activation inference, which activates only a small number of neurons for each token, offers a novel way to accelerate model inference without degrading performance, show&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18311v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18311v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18311v1-abstract-full" style="display: none;"> Large language models (LLMs) with billions of parameters have sparked a new wave of exciting AI applications. However, their high computational costs and memory demands during inference pose significant challenges. Adaptive sparse activation inference, which activates only a small number of neurons for each token, offers a novel way to accelerate model inference without degrading performance, showing great potential for resource-constrained hardware devices. Nevertheless, existing methods predict activated neurons based on individual tokens with additional MLP, which involve frequent changes in activation maps and resource calls, limiting the acceleration benefits of sparse activation. In this paper, we introduce CoreInfer, an MLP-free adaptive sparse activation inference method based on sentence-level prediction. Specifically, we propose the concept of sentence-wise core neurons, which refers to the subset of neurons most critical for a given sentence, and empirically demonstrate its effectiveness. To determine the core neurons, we explore the correlation between core neurons and the sentence&#39;s semantics. Remarkably, we discovered that core neurons exhibit both stability and similarity in relation to the sentence&#39;s semantics -- an insight overlooked by previous studies. Building on this finding, we further design two semantic-based methods for predicting core neurons to fit different input scenarios. In CoreInfer, the core neurons are determined during the pre-filling stage and fixed during the encoding stage, enabling zero-cost sparse inference. We evaluated the model generalization and task generalization of CoreInfer across various models and tasks. Notably, on an NVIDIA TITAN XP GPU, CoreInfer achieved a 10.33 times and 2.72 times speedup compared to the Huggingface implementation and PowerInfer, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18311v1-abstract-full').style.display = 'none'; document.getElementById('2410.18311v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://wangqinsi1.github.io/coreinfer_page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17809">arXiv:2410.17809</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17809">pdf</a>, <a href="https://arxiv.org/format/2410.17809">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Intelligent Agentic System for Complex Image Restoration Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kaiwen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinjin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+Z">Zhiyuan You</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+C">Chao Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17809v1-abstract-short" style="display: inline;"> Real-world image restoration (IR) is inherently complex and often requires combining multiple specialized models to address diverse degradations. Inspired by human problem-solving, we propose AgenticIR, an agentic system that mimics the human approach to image processing by following five key stages: Perception, Scheduling, Execution, Reflection, and Rescheduling. AgenticIR leverages large languag&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17809v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17809v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17809v1-abstract-full" style="display: none;"> Real-world image restoration (IR) is inherently complex and often requires combining multiple specialized models to address diverse degradations. Inspired by human problem-solving, we propose AgenticIR, an agentic system that mimics the human approach to image processing by following five key stages: Perception, Scheduling, Execution, Reflection, and Rescheduling. AgenticIR leverages large language models (LLMs) and vision-language models (VLMs) that interact via text generation to dynamically operate a toolbox of IR models. We fine-tune VLMs for image quality analysis and employ LLMs for reasoning, guiding the system step by step. To compensate for LLMs&#39; lack of specific IR knowledge and experience, we introduce a self-exploration method, allowing the LLM to observe and summarize restoration results into referenceable documents. Experiments demonstrate AgenticIR&#39;s potential in handling complex IR tasks, representing a promising path toward achieving general intelligence in visual processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17809v1-abstract-full').style.display = 'none'; document.getElementById('2410.17809v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16813">arXiv:2410.16813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16813">pdf</a>, <a href="https://arxiv.org/ps/2410.16813">ps</a>, <a href="https://arxiv.org/format/2410.16813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Klein Model for Hyperbolic Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yidan Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jing Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Werner%2C+M+C">Marcus C. Werner</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+D">Dongmian Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16813v1-abstract-short" style="display: inline;"> Hyperbolic neural networks (HNNs) have been proved effective in modeling complex data structures. However, previous works mainly focused on the Poincar茅 ball model and the hyperboloid model as coordinate representations of the hyperbolic space, often neglecting the Klein model. Despite this, the Klein model offers its distinct advantages thanks to its straight-line geodesics, which facilitates the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16813v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16813v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16813v1-abstract-full" style="display: none;"> Hyperbolic neural networks (HNNs) have been proved effective in modeling complex data structures. However, previous works mainly focused on the Poincar茅 ball model and the hyperboloid model as coordinate representations of the hyperbolic space, often neglecting the Klein model. Despite this, the Klein model offers its distinct advantages thanks to its straight-line geodesics, which facilitates the well-known Einstein midpoint construction, previously leveraged to accompany HNNs in other models. In this work, we introduce a framework for hyperbolic neural networks based on the Klein model. We provide detailed formulation for representing useful operations using the Klein model. We further study the Klein linear layer and prove that the &#34;tangent space construction&#34; of the scalar multiplication and parallel transport are exactly the Einstein scalar multiplication and the Einstein addition, analogous to the M枚bius operations used in the Poincar茅 ball model. We show numerically that the Klein HNN performs on par with the Poincar茅 ball model, providing a third option for HNN that works as a building block for more complicated architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16813v1-abstract-full').style.display = 'none'; document.getElementById('2410.16813v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024 Symmetry and Geometry in Neural Representations Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16400">arXiv:2410.16400</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16400">pdf</a>, <a href="https://arxiv.org/format/2410.16400">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VipAct: Visual-Perception Enhancement via Specialized VLM Agent Collaboration and Tool-use </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhehao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rossi%2C+R">Ryan Rossi</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lipka%2C+N">Nedim Lipka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16400v1-abstract-short" style="display: inline;"> While vision-language models (VLMs) have demonstrated remarkable performance across various tasks combining textual and visual information, they continue to struggle with fine-grained visual perception tasks that require detailed pixel-level analysis. Effectively eliciting comprehensive reasoning from VLMs on such intricate visual elements remains an open challenge. In this paper, we present VipAc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16400v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16400v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16400v1-abstract-full" style="display: none;"> While vision-language models (VLMs) have demonstrated remarkable performance across various tasks combining textual and visual information, they continue to struggle with fine-grained visual perception tasks that require detailed pixel-level analysis. Effectively eliciting comprehensive reasoning from VLMs on such intricate visual elements remains an open challenge. In this paper, we present VipAct, an agent framework that enhances VLMs by integrating multi-agent collaboration and vision expert models, enabling more precise visual understanding and comprehensive reasoning. VipAct consists of an orchestrator agent, which manages task requirement analysis, planning, and coordination, along with specialized agents that handle specific tasks such as image captioning and vision expert models that provide high-precision perceptual information. This multi-agent approach allows VLMs to better perform fine-grained visual perception tasks by synergizing planning, reasoning, and tool use. We evaluate VipAct on benchmarks featuring a diverse set of visual perception tasks, with experimental results demonstrating significant performance improvements over state-of-the-art baselines across all tasks. Furthermore, comprehensive ablation studies reveal the critical role of multi-agent collaboration in eliciting more detailed System-2 reasoning and highlight the importance of image input for task planning. Additionally, our error analysis identifies patterns of VLMs&#39; inherent limitations in visual perception, providing insights into potential future improvements. VipAct offers a flexible and extensible framework, paving the way for more advanced visual perception systems across various real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16400v1-abstract-full').style.display = 'none'; document.getElementById('2410.16400v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15277">arXiv:2410.15277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15277">pdf</a>, <a href="https://arxiv.org/format/2410.15277">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BRIEF: Bridging Retrieval and Inference for Multi-hop Reasoning via Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuankai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jia-Chen Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+N">Nanyun Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15277v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) can supplement large language models (LLMs) by integrating external knowledge. However, as the number of retrieved documents increases, the input length to LLMs grows linearly, causing a dramatic increase in latency and a degradation in long-context understanding. This is particularly serious for multi-hop questions that require a chain of reasoning across docu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15277v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15277v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15277v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) can supplement large language models (LLMs) by integrating external knowledge. However, as the number of retrieved documents increases, the input length to LLMs grows linearly, causing a dramatic increase in latency and a degradation in long-context understanding. This is particularly serious for multi-hop questions that require a chain of reasoning across documents. To accelerate inference, reduce costs, and minimize distractions, this paper presents BRIEF (Bridging Retrieval and Inference through Evidence Fusion), a lightweight approach that performs query-aware multi-hop reasoning by compressing retrieved documents into highly dense textual summaries to integrate into in-context learning. To enable learning compression for multi-hop reasoning, we curate synthetic data by extracting atomic proposition expressions that encapsulate distinct factoids from the source documents to compose synthetic summaries. Based on our synthetic data built entirely by open-source models, BRIEF generates more concise summaries and enables a range of LLMs to achieve exceptional open-domain question answering (QA) performance. For example, on HotpotQA, BRIEF improves the compression rate by 2 times compared to the state-of-the-art baseline, while outperforming it by 3.00% EM and 4.16% F1 with Flan-UL2 as the reader LM. It also generates more concise summaries than proprietary GPT-3.5, while demonstrating nearly identical QA performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15277v1-abstract-full').style.display = 'none'; document.getElementById('2410.15277v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://jasonforjoy.github.io/BRIEF/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13807">arXiv:2410.13807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13807">pdf</a>, <a href="https://arxiv.org/format/2410.13807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ConsisSR: Delving Deep into Consistency in Diffusion-based Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Junhao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+P">Peng-Tao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+M">Mi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jinwei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wenming Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13807v1-abstract-short" style="display: inline;"> Real-world image super-resolution (Real-ISR) aims at restoring high-quality (HQ) images from low-quality (LQ) inputs corrupted by unknown and complex degradations. In particular, pretrained text-to-image (T2I) diffusion models provide strong generative priors to reconstruct credible and intricate details. However, T2I generation focuses on semantic consistency while Real-ISR emphasizes pixel-level&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13807v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13807v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13807v1-abstract-full" style="display: none;"> Real-world image super-resolution (Real-ISR) aims at restoring high-quality (HQ) images from low-quality (LQ) inputs corrupted by unknown and complex degradations. In particular, pretrained text-to-image (T2I) diffusion models provide strong generative priors to reconstruct credible and intricate details. However, T2I generation focuses on semantic consistency while Real-ISR emphasizes pixel-level reconstruction, which hinders existing methods from fully exploiting diffusion priors. To address this challenge, we introduce ConsisSR to handle both semantic and pixel-level consistency. Specifically, compared to coarse-grained text prompts, we exploit the more powerful CLIP image embedding and effectively leverage both modalities through our Hybrid Prompt Adapter (HPA) for semantic guidance. Secondly, we introduce Time-aware Latent Augmentation (TALA) to mitigate the inherent gap between T2I generation and Real-ISR consistency requirements. By randomly mixing LQ and HQ latent inputs, our model not only handle timestep-specific diffusion noise but also refine the accumulated latent representations. Last but not least, our GAN-Embedding strategy employs the pretrained Real-ESRGAN model to refine the diffusion start point. This accelerates the inference process to 10 steps while preserving sampling quality, in a training-free manner. Our method demonstrates state-of-the-art performance among both full-scale and accelerated models. The code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13807v1-abstract-full').style.display = 'none'; document.getElementById('2410.13807v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12844">arXiv:2410.12844</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12844">pdf</a>, <a href="https://arxiv.org/format/2410.12844">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TextLap: Customizing Language Models for Text-to-Layout Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yufan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Healey%2C+J">Jennifer Healey</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Changyou Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12844v1-abstract-short" style="display: inline;"> Automatic generation of graphical layouts is crucial for many real-world applications, including designing posters, flyers, advertisements, and graphical user interfaces. Given the incredible ability of Large language models (LLMs) in both natural language understanding and generation, we believe that we could customize an LLM to help people create compelling graphical layouts starting with only t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12844v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12844v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12844v1-abstract-full" style="display: none;"> Automatic generation of graphical layouts is crucial for many real-world applications, including designing posters, flyers, advertisements, and graphical user interfaces. Given the incredible ability of Large language models (LLMs) in both natural language understanding and generation, we believe that we could customize an LLM to help people create compelling graphical layouts starting with only text instructions from the user. We call our method TextLap (text-based layout planning). It uses a curated instruction-based layout planning dataset (InsLap) to customize LLMs as a graphic designer. We demonstrate the effectiveness of TextLap and show that it outperforms strong baselines, including GPT-4 based methods, for image generation and graphical design benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12844v1-abstract-full').style.display = 'none'; document.getElementById('2410.12844v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the EMNLP Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12836">arXiv:2410.12836</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12836">pdf</a>, <a href="https://arxiv.org/format/2410.12836">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+K">Kaizhi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaotong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuehai He</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jing Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lijuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X+E">Xin Eric Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12836v1-abstract-short" style="display: inline;"> Given the steep learning curve of professional 3D software and the time-consuming process of managing large 3D assets, language-guided 3D scene editing has significant potential in fields such as virtual reality, augmented reality, and gaming. However, recent approaches to language-guided 3D scene editing either require manual interventions or focus only on appearance modifications without support&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12836v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12836v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12836v1-abstract-full" style="display: none;"> Given the steep learning curve of professional 3D software and the time-consuming process of managing large 3D assets, language-guided 3D scene editing has significant potential in fields such as virtual reality, augmented reality, and gaming. However, recent approaches to language-guided 3D scene editing either require manual interventions or focus only on appearance modifications without supporting comprehensive scene layout changes. In response, we propose Edit-Room, a unified framework capable of executing a variety of layout edits through natural language commands, without requiring manual intervention. Specifically, EditRoom leverages Large Language Models (LLMs) for command planning and generates target scenes using a diffusion-based method, enabling six types of edits: rotate, translate, scale, replace, add, and remove. To address the lack of data for language-guided 3D scene editing, we have developed an automatic pipeline to augment existing 3D scene synthesis datasets and introduced EditRoom-DB, a large-scale dataset with 83k editing pairs, for training and evaluation. Our experiments demonstrate that our approach consistently outperforms other baselines across all metrics, indicating higher accuracy and coherence in language-guided scene layout editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12836v1-abstract-full').style.display = 'none'; document.getElementById('2410.12836v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10818">arXiv:2410.10818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10818">pdf</a>, <a href="https://arxiv.org/format/2410.10818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TemporalBench: Benchmarking Fine-grained Temporal Understanding for Multimodal Video Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+M">Mu Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+R">Reuben Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianrui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+B">Bocheng Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+F">Feng Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+F">Fangrui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jing Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yiwu Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+Y">Yuzhang Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+Y">Yao Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">Jaden Park</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y+J">Yong Jae Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jianwei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10818v2-abstract-short" style="display: inline;"> Understanding fine-grained temporal dynamics is crucial for multimodal video comprehension and generation. Due to the lack of fine-grained temporal annotations, existing video benchmarks mostly resemble static image benchmarks and are incompetent at evaluating models for temporal understanding. In this paper, we introduce TemporalBench, a new benchmark dedicated to evaluating fine-grained temporal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10818v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10818v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10818v2-abstract-full" style="display: none;"> Understanding fine-grained temporal dynamics is crucial for multimodal video comprehension and generation. Due to the lack of fine-grained temporal annotations, existing video benchmarks mostly resemble static image benchmarks and are incompetent at evaluating models for temporal understanding. In this paper, we introduce TemporalBench, a new benchmark dedicated to evaluating fine-grained temporal understanding in videos. TemporalBench consists of ~10K video question-answer pairs, derived from ~2K high-quality human annotations detailing the temporal dynamics in video clips. As a result, our benchmark provides a unique testbed for evaluating various temporal understanding and reasoning abilities such as action frequency, motion magnitude, event order, etc. Moreover, it enables evaluations on various tasks like both video question answering and captioning, both short and long video understanding, as well as different models such as multimodal video embedding models and text generation models. Results show that state-of-the-art models like GPT-4o achieve only 38.5% question answering accuracy on TemporalBench, demonstrating a significant gap (~30%) between humans and AI in temporal understanding. Furthermore, we notice a critical pitfall for multi-choice QA where LLMs can detect the subtle changes in negative captions and find a centralized description as a cue for its prediction, where we propose Multiple Binary Accuracy (MBA) to correct such bias. We hope that TemporalBench can foster research on improving models&#39; temporal reasoning capabilities. Both dataset and evaluation code will be made available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10818v2-abstract-full').style.display = 'none'; document.getElementById('2410.10818v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://temporalbench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09140">arXiv:2410.09140</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09140">pdf</a>, <a href="https://arxiv.org/format/2410.09140">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RealEra: Semantic-level Concept Erasure via Neighbor-Concept Mining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yufan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+J">Jinyang An</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wanqian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Dayan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jingzi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zheng Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weiping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09140v1-abstract-short" style="display: inline;"> The remarkable development of text-to-image generation models has raised notable security concerns, such as the infringement of portrait rights and the generation of inappropriate content. Concept erasure has been proposed to remove the model&#39;s knowledge about protected and inappropriate concepts. Although many methods have tried to balance the efficacy (erasing target concepts) and specificity (r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09140v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09140v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09140v1-abstract-full" style="display: none;"> The remarkable development of text-to-image generation models has raised notable security concerns, such as the infringement of portrait rights and the generation of inappropriate content. Concept erasure has been proposed to remove the model&#39;s knowledge about protected and inappropriate concepts. Although many methods have tried to balance the efficacy (erasing target concepts) and specificity (retaining irrelevant concepts), they can still generate abundant erasure concepts under the steering of semantically related inputs. In this work, we propose RealEra to address this &#34;concept residue&#34; issue. Specifically, we first introduce the mechanism of neighbor-concept mining, digging out the associated concepts by adding random perturbation into the embedding of erasure concept, thus expanding the erasing range and eliminating the generations even through associated concept inputs. Furthermore, to mitigate the negative impact on the generation of irrelevant concepts caused by the expansion of erasure scope, RealEra preserves the specificity through the beyond-concept regularization. This makes irrelevant concepts maintain their corresponding spatial position, thereby preserving their normal generation performance. We also employ the closed-form solution to optimize weights of U-Net for the cross-attention alignment, as well as the prediction noise alignment with the LoRA module. Extensive experiments on multiple benchmarks demonstrate that RealEra outperforms previous concept erasing methods in terms of superior erasing efficacy, specificity, and generality. More details are available on our project page https://realerasing.github.io/RealEra/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09140v1-abstract-full').style.display = 'none'; document.getElementById('2410.09140v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08182">arXiv:2410.08182</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08182">pdf</a>, <a href="https://arxiv.org/format/2410.08182">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wenbo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jia-Chen Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+Z">Zi-Yi Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Fayyaz%2C+M">Mohsen Fayyaz</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+P">Pan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+N">Nanyun Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08182v1-abstract-short" style="display: inline;"> Existing multimodal retrieval benchmarks primarily focus on evaluating whether models can retrieve and utilize external textual knowledge for question answering. However, there are scenarios where retrieving visual information is either more beneficial or easier to access than textual data. In this paper, we introduce a multimodal retrieval-augmented generation benchmark, MRAG-Bench, in which we s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08182v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08182v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08182v1-abstract-full" style="display: none;"> Existing multimodal retrieval benchmarks primarily focus on evaluating whether models can retrieve and utilize external textual knowledge for question answering. However, there are scenarios where retrieving visual information is either more beneficial or easier to access than textual data. In this paper, we introduce a multimodal retrieval-augmented generation benchmark, MRAG-Bench, in which we systematically identify and categorize scenarios where visually augmented knowledge is better than textual knowledge, for instance, more images from varying viewpoints. MRAG-Bench consists of 16,130 images and 1,353 human-annotated multiple-choice questions across 9 distinct scenarios. With MRAG-Bench, we conduct an evaluation of 10 open-source and 4 proprietary large vision-language models (LVLMs). Our results show that all LVLMs exhibit greater improvements when augmented with images compared to textual knowledge, confirming that MRAG-Bench is vision-centric. Additionally, we conduct extensive analysis with MRAG-Bench, which offers valuable insights into retrieval-augmented LVLMs. Notably, the top-performing model, GPT-4o, faces challenges in effectively leveraging retrieved knowledge, achieving only a 5.82% improvement with ground-truth information, in contrast to a 33.16% improvement observed in human participants. These findings highlight the importance of MRAG-Bench in encouraging the community to enhance LVLMs&#39; ability to utilize retrieved visual knowledge more effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08182v1-abstract-full').style.display = 'none'; document.getElementById('2410.08182v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://mragbench.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08159">arXiv:2410.08159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08159">pdf</a>, <a href="https://arxiv.org/format/2410.08159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DART: Denoising Autoregressive Transformer for Scalable Text-to-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiatao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yizhe Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qihang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jaitly%2C+N">Navdeep Jaitly</a>, <a href="/search/cs?searchtype=author&amp;query=Susskind%2C+J">Josh Susskind</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+S">Shuangfei Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08159v1-abstract-short" style="display: inline;"> Diffusion models have become the dominant approach for visual generation. They are trained by denoising a Markovian process that gradually adds noise to the input. We argue that the Markovian property limits the models ability to fully utilize the generation trajectory, leading to inefficiencies during training and inference. In this paper, we propose DART, a transformer-based model that unifies a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08159v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08159v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08159v1-abstract-full" style="display: none;"> Diffusion models have become the dominant approach for visual generation. They are trained by denoising a Markovian process that gradually adds noise to the input. We argue that the Markovian property limits the models ability to fully utilize the generation trajectory, leading to inefficiencies during training and inference. In this paper, we propose DART, a transformer-based model that unifies autoregressive (AR) and diffusion within a non-Markovian framework. DART iteratively denoises image patches spatially and spectrally using an AR model with the same architecture as standard language models. DART does not rely on image quantization, enabling more effective image modeling while maintaining flexibility. Furthermore, DART seamlessly trains with both text and image data in a unified model. Our approach demonstrates competitive performance on class-conditioned and text-to-image generation tasks, offering a scalable, efficient alternative to traditional diffusion models. Through this unified framework, DART sets a new benchmark for scalable, high-quality image synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08159v1-abstract-full').style.display = 'none'; document.getElementById('2410.08159v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04810">arXiv:2410.04810</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04810">pdf</a>, <a href="https://arxiv.org/format/2410.04810">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> FedBiP: Heterogeneous One-Shot Federated Learning with Personalized Latent Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haokun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gengyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+J">Jinhe Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Krompass%2C+D">Denis Krompass</a>, <a href="/search/cs?searchtype=author&amp;query=Tresp%2C+V">Volker Tresp</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04810v1-abstract-short" style="display: inline;"> One-Shot Federated Learning (OSFL), a special decentralized machine learning paradigm, has recently gained significant attention. OSFL requires only a single round of client data or model upload, which reduces communication costs and mitigates privacy threats compared to traditional FL. Despite these promising prospects, existing methods face challenges due to client data heterogeneity and limited&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04810v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04810v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04810v1-abstract-full" style="display: none;"> One-Shot Federated Learning (OSFL), a special decentralized machine learning paradigm, has recently gained significant attention. OSFL requires only a single round of client data or model upload, which reduces communication costs and mitigates privacy threats compared to traditional FL. Despite these promising prospects, existing methods face challenges due to client data heterogeneity and limited data quantity when applied to real-world OSFL systems. Recently, Latent Diffusion Models (LDM) have shown remarkable advancements in synthesizing high-quality images through pretraining on large-scale datasets, thereby presenting a potential solution to overcome these issues. However, directly applying pretrained LDM to heterogeneous OSFL results in significant distribution shifts in synthetic data, leading to performance degradation in classification models trained on such data. This issue is particularly pronounced in rare domains, such as medical imaging, which are underrepresented in LDM&#39;s pretraining data. To address this challenge, we propose Federated Bi-Level Personalization (FedBiP), which personalizes the pretrained LDM at both instance-level and concept-level. Hereby, FedBiP synthesizes images following the client&#39;s local data distribution without compromising the privacy regulations. FedBiP is also the first approach to simultaneously address feature space heterogeneity and client data scarcity in OSFL. Our method is validated through extensive experiments on three OSFL benchmarks with feature space heterogeneity, as well as on challenging medical and satellite image datasets with label heterogeneity. The results demonstrate the effectiveness of FedBiP, which substantially outperforms other OSFL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04810v1-abstract-full').style.display = 'none'; document.getElementById('2410.04810v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04161">arXiv:2410.04161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04161">pdf</a>, <a href="https://arxiv.org/format/2410.04161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Overcoming False Illusions in Real-World Face Restoration with Multi-Modal Guided Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tao%2C+K">Keda Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinjin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiucheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Nan Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04161v1-abstract-short" style="display: inline;"> We introduce a novel Multi-modal Guided Real-World Face Restoration (MGFR) technique designed to improve the quality of facial image restoration from low-quality inputs. Leveraging a blend of attribute text prompts, high-quality reference images, and identity information, MGFR can mitigate the generation of false facial attributes and identities often associated with generative face restoration me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04161v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04161v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04161v1-abstract-full" style="display: none;"> We introduce a novel Multi-modal Guided Real-World Face Restoration (MGFR) technique designed to improve the quality of facial image restoration from low-quality inputs. Leveraging a blend of attribute text prompts, high-quality reference images, and identity information, MGFR can mitigate the generation of false facial attributes and identities often associated with generative face restoration methods. By incorporating a dual-control adapter and a two-stage training strategy, our method effectively utilizes multi-modal prior information for targeted restoration tasks. We also present the Reface-HQ dataset, comprising over 23,000 high-resolution facial images across 5,000 identities, to address the need for reference face training images. Our approach achieves superior visual quality in restoring facial details under severe degradation and allows for controlled restoration processes, enhancing the accuracy of identity preservation and attribute correction. Including negative quality samples and attribute prompts in the training further refines the model&#39;s ability to generate detailed and perceptually accurate images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04161v1-abstract-full').style.display = 'none'; document.getElementById('2410.04161v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 Pages, 28 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01944">arXiv:2410.01944</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01944">pdf</a>, <a href="https://arxiv.org/format/2410.01944">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> One-step Noisy Label Mitigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiayang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jingkuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">An Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+L">Lianli Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01944v1-abstract-short" style="display: inline;"> Mitigating the detrimental effects of noisy labels on the training process has become increasingly critical, as obtaining entirely clean or human-annotated samples for large-scale pre-training tasks is often impractical. Nonetheless, existing noise mitigation methods often encounter limitations in practical applications due to their task-specific design, model dependency, and significant computati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01944v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01944v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01944v1-abstract-full" style="display: none;"> Mitigating the detrimental effects of noisy labels on the training process has become increasingly critical, as obtaining entirely clean or human-annotated samples for large-scale pre-training tasks is often impractical. Nonetheless, existing noise mitigation methods often encounter limitations in practical applications due to their task-specific design, model dependency, and significant computational overhead. In this work, we exploit the properties of high-dimensional orthogonality to identify a robust and effective boundary in cone space for separating clean and noisy samples. Building on this, we propose One-step Anti-Noise (OSA), a model-agnostic noisy label mitigation paradigm that employs an estimator model and a scoring function to assess the noise level of input pairs through just one-step inference, a cost-efficient process. We empirically demonstrate the superiority of OSA, highlighting its enhanced training robustness, improved task transferability, ease of deployment, and reduced computational costs across various benchmarks, models, and tasks. Our code is released at https://github.com/leolee99/OSA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01944v1-abstract-full').style.display = 'none'; document.getElementById('2410.01944v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 4 figures, 11 Tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01756">arXiv:2410.01756</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01756">pdf</a>, <a href="https://arxiv.org/format/2410.01756">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ImageFolder: Autoregressive Image Generation with Folded Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+K">Kai Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Kuen%2C+J">Jason Kuen</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Raj%2C+B">Bhiksha Raj</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhe Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01756v2-abstract-short" style="display: inline;"> Image tokenizers are crucial for visual generative models, e.g., diffusion models (DMs) and autoregressive (AR) models, as they construct the latent representation for modeling. Increasing token length is a common approach to improve the image reconstruction quality. However, tokenizers with longer token lengths are not guaranteed to achieve better generation quality. There exists a trade-off betw&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01756v2-abstract-full').style.display = 'inline'; document.getElementById('2410.01756v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01756v2-abstract-full" style="display: none;"> Image tokenizers are crucial for visual generative models, e.g., diffusion models (DMs) and autoregressive (AR) models, as they construct the latent representation for modeling. Increasing token length is a common approach to improve the image reconstruction quality. However, tokenizers with longer token lengths are not guaranteed to achieve better generation quality. There exists a trade-off between reconstruction and generation quality regarding token length. In this paper, we investigate the impact of token length on both image reconstruction and generation and provide a flexible solution to the tradeoff. We propose ImageFolder, a semantic tokenizer that provides spatially aligned image tokens that can be folded during autoregressive modeling to improve both generation efficiency and quality. To enhance the representative capability without increasing token length, we leverage dual-branch product quantization to capture different contexts of images. Specifically, semantic regularization is introduced in one branch to encourage compacted semantic information while another branch is designed to capture the remaining pixel-level details. Extensive experiments demonstrate the superior quality of image generation and shorter token length with ImageFolder tokenizer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01756v2-abstract-full').style.display = 'none'; document.getElementById('2410.01756v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/lxa9867/ImageFolder</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01313">arXiv:2410.01313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01313">pdf</a>, <a href="https://arxiv.org/format/2410.01313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> ADEPT-Z: Zero-Shot Automated Circuit Topology Search for Pareto-Optimal Photonic Tensor Cores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Ziyang Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Rena Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01313v1-abstract-short" style="display: inline;"> Photonic tensor cores (PTCs) are essential building blocks for optical artificial intelligence (AI) accelerators based on programmable photonic integrated circuits. Most PTC designs today are manually constructed, with low design efficiency and unsatisfying solution quality. This makes it challenging to meet various hardware specifications and keep up with rapidly evolving AI applications. Prior w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01313v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01313v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01313v1-abstract-full" style="display: none;"> Photonic tensor cores (PTCs) are essential building blocks for optical artificial intelligence (AI) accelerators based on programmable photonic integrated circuits. Most PTC designs today are manually constructed, with low design efficiency and unsatisfying solution quality. This makes it challenging to meet various hardware specifications and keep up with rapidly evolving AI applications. Prior work has explored gradient-based methods to learn a good PTC structure differentiably. However, it suffers from slow training speed and optimization difficulty when handling multiple non-differentiable objectives and constraints. Therefore, in this work, we propose a more flexible and efficient zero-shot multi-objective evolutionary topology search framework ADEPT-Z that explores Pareto-optimal PTC designs with advanced devices in a larger search space. Multiple objectives can be co-optimized while honoring complicated hardware constraints. With only &lt;3 hours of search, we can obtain tens of diverse Pareto-optimal solutions, 100x faster than the prior gradient-based method, outperforming prior manual designs with 2x higher accuracy weighted area-energy efficiency. The code of ADEPT-Z is available at https://github.com/ScopeX-ASU/ADEPT-Z. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01313v1-abstract-full').style.display = 'none'; document.getElementById('2410.01313v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. Accepted to ACM/IEEE ASP-DAC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01289">arXiv:2410.01289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01289">pdf</a>, <a href="https://arxiv.org/format/2410.01289">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> The Unlikely Hero: Nonideality in Analog Photonic Neural Networks as Built-in Defender Against Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Haotian Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Bhoumik%2C+P">Partho Bhoumik</a>, <a href="/search/cs?searchtype=author&amp;query=Banerjee%2C+S">Sanmitra Banerjee</a>, <a href="/search/cs?searchtype=author&amp;query=Chakrabarty%2C+K">Krishnendu Chakrabarty</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01289v1-abstract-short" style="display: inline;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01289v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01289v1-abstract-full" style="display: none;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware variations can be mitigated with robustness-driven optimization methods, malicious attacks on the hardware show distinct behaviors from noises, which requires a customized protection method tailored to optical analog hardware. In this work, we rethink the role of conventionally undesired non-idealities in photonic analog accelerators and claim their surprising effects on defending against adversarial weight attacks. Inspired by the protection effects from DNN quantization and pruning, we propose a synergistic defense framework tailored for optical analog hardware that proactively protects sensitive weights via pre-attack unary weight encoding and post-attack vulnerability-aware weight locking. Efficiency-reliability trade-offs are formulated as constrained optimization problems and efficiently solved offline without model re-training costs. Extensive evaluation of various DNN benchmarks with a multi-core photonic accelerator shows that our framework maintains near-ideal on-chip inference accuracy under adversarial bit-flip attacks with merely &lt;3% memory overhead. Our codes are open-sourced at https://github.com/ScopeX-ASU/Unlikely_Hero. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'none'; document.getElementById('2410.01289v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. Accepted to ACM/IEEE ASP-DAC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01260">arXiv:2410.01260</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01260">pdf</a>, <a href="https://arxiv.org/format/2410.01260">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Automated Curvy Waveguide Routing for Large-Scale Photonic Integrated Circuits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hongjian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Keren Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01260v1-abstract-short" style="display: inline;"> As photonic integrated circuit (PIC) designs advance and grow in complexity, largely driven by innovations in photonic computing and interconnects, traditional manual physical design processes have become increasingly cumbersome. Available PIC layout automation tools are mostly schematic-driven, which has not alleviated the burden of manual waveguide planning and layout drawing for engineers. Prev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01260v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01260v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01260v1-abstract-full" style="display: none;"> As photonic integrated circuit (PIC) designs advance and grow in complexity, largely driven by innovations in photonic computing and interconnects, traditional manual physical design processes have become increasingly cumbersome. Available PIC layout automation tools are mostly schematic-driven, which has not alleviated the burden of manual waveguide planning and layout drawing for engineers. Previous research in automated PIC routing largely relies on off-the-shelf algorithms designed for electrical circuits, which only support high-level route planning to minimize waveguide crossings. It is not customized to handle unique photonics-specific routing constraints and metrics, such as curvy waveguides, bending, port alignment, and insertion loss. These approaches struggle with large-scale PICs and cannot produce real layout geometries without design-rule violations (DRVs). This highlights the pressing need for electronic-photonic design automation (EPDA) tools that can streamline the physical design of modern PICs. In this paper, for the first time, we propose an open-source automated PIC detailed routing tool, dubbed APR, to generate DRV-free PIC layout for large-scale real-world PICs. APR features a grid-based curvy-aware A* engine with adaptive crossing insertion, congestion-aware net ordering and objective, and crossing-waveguide optimization scheme, all tailored to the unique property of PIC. On large-scale real-world photonic computing cores and interconnects, APR generates a DRV-free layout with 14% lower insertion loss and 6.25x speedup than prior methods, paving the way for future advancements in the EPDA toolchain. Our codes are open-sourced at https://github.com/ScopeX-ASU/APR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01260v1-abstract-full').style.display = 'none'; document.getElementById('2410.01260v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19339">arXiv:2409.19339</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19339">pdf</a>, <a href="https://arxiv.org/format/2409.19339">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Visual Question Decomposition on Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jianzhe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Bailan He</a>, <a href="/search/cs?searchtype=author&amp;query=Tresp%2C+V">Volker Tresp</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jindong Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19339v2-abstract-short" style="display: inline;"> Question decomposition has emerged as an effective strategy for prompting Large Language Models (LLMs) to answer complex questions. However, while existing methods primarily focus on unimodal language models, the question decomposition capability of Multimodal Large Language Models (MLLMs) has yet to be explored. To this end, this paper explores visual question decomposition on MLLMs. Specifically&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19339v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19339v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19339v2-abstract-full" style="display: none;"> Question decomposition has emerged as an effective strategy for prompting Large Language Models (LLMs) to answer complex questions. However, while existing methods primarily focus on unimodal language models, the question decomposition capability of Multimodal Large Language Models (MLLMs) has yet to be explored. To this end, this paper explores visual question decomposition on MLLMs. Specifically, we introduce a systematic evaluation framework including a dataset and several evaluation criteria to assess the quality of the decomposed sub-questions, revealing that existing MLLMs struggle to produce high-quality sub-questions. To address this limitation, we propose a specific finetuning dataset, DecoVQA+, for enhancing the model&#39;s question decomposition capability. Aiming at enabling models to perform appropriate selective decomposition, we propose an efficient finetuning pipeline. The finetuning pipeline consists of our proposed dataset and a training objective for selective decomposition. Finetuned MLLMs demonstrate significant improvements in the quality of sub-questions and the policy of selective question decomposition. Additionally, the models also achieve higher accuracy with selective decomposition on VQA benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19339v2-abstract-full').style.display = 'none'; document.getElementById('2409.19339v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19149">arXiv:2409.19149</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19149">pdf</a>, <a href="https://arxiv.org/format/2409.19149">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Pragmatic Jailbreak on Text-to-image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+Z">Zhixin Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gengyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&amp;query=Demberg%2C+V">Vera Demberg</a>, <a href="/search/cs?searchtype=author&amp;query=Tresp%2C+V">Volker Tresp</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jindong Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19149v1-abstract-short" style="display: inline;"> Diffusion models have recently achieved remarkable advancements in terms of image quality and fidelity to textual prompts. Concurrently, the safety of such generative models has become an area of growing concern. This work introduces a novel type of jailbreak, which triggers T2I models to generate the image with visual text, where the image and the text, although considered to be safe in isolation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19149v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19149v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19149v1-abstract-full" style="display: none;"> Diffusion models have recently achieved remarkable advancements in terms of image quality and fidelity to textual prompts. Concurrently, the safety of such generative models has become an area of growing concern. This work introduces a novel type of jailbreak, which triggers T2I models to generate the image with visual text, where the image and the text, although considered to be safe in isolation, combine to form unsafe content. To systematically explore this phenomenon, we propose a dataset to evaluate the current diffusion-based text-to-image (T2I) models under such jailbreak. We benchmark nine representative T2I models, including two close-source commercial models. Experimental results reveal a concerning tendency to produce unsafe content: all tested models suffer from such type of jailbreak, with rates of unsafe generation ranging from 8\% to 74\%. In real-world scenarios, various filters such as keyword blocklists, customized prompt filters, and NSFW image filters, are commonly employed to mitigate these risks. We evaluate the effectiveness of such filters against our jailbreak and found that, while current classifiers may be effective for single modality detection, they fail to work against our jailbreak. Our work provides a foundation for further development towards more secure and reliable T2I models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19149v1-abstract-full').style.display = 'none'; document.getElementById('2409.19149v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18783">arXiv:2409.18783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18783">pdf</a>, <a href="https://arxiv.org/format/2409.18783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DualDn: Dual-domain Denoising via Differentiable ISP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Ruikang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18783v2-abstract-short" style="display: inline;"> Image denoising is a critical component in a camera&#39;s Image Signal Processing (ISP) pipeline. There are two typical ways to inject a denoiser into the ISP pipeline: applying a denoiser directly to captured raw frames (raw domain) or to the ISP&#39;s output sRGB images (sRGB domain). However, both approaches have their limitations. Residual noise from raw-domain denoising can be amplified by the subseq&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18783v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18783v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18783v2-abstract-full" style="display: none;"> Image denoising is a critical component in a camera&#39;s Image Signal Processing (ISP) pipeline. There are two typical ways to inject a denoiser into the ISP pipeline: applying a denoiser directly to captured raw frames (raw domain) or to the ISP&#39;s output sRGB images (sRGB domain). However, both approaches have their limitations. Residual noise from raw-domain denoising can be amplified by the subsequent ISP processing, and the sRGB domain struggles to handle spatially varying noise since it only sees noise distorted by the ISP. Consequently, most raw or sRGB domain denoising works only for specific noise distributions and ISP configurations. To address these challenges, we propose DualDn, a novel learning-based dual-domain denoising. Unlike previous single-domain denoising, DualDn consists of two denoising networks: one in the raw domain and one in the sRGB domain. The raw domain denoising adapts to sensor-specific noise as well as spatially varying noise levels, while the sRGB domain denoising adapts to ISP variations and removes residual noise amplified by the ISP. Both denoising networks are connected with a differentiable ISP, which is trained end-to-end and discarded during the inference stage. With this design, DualDn achieves greater generalizability compared to most learning-based denoising methods, as it can adapt to different unseen noises, ISP parameters, and even novel ISP pipelines. Experiments show that DualDn achieves state-of-the-art performance and can adapt to different denoising architectures. Moreover, DualDn can be used as a plug-and-play denoising module with real cameras without retraining, and still demonstrate better performance than commercial on-camera denoising. The project website is available at: https://openimaginglab.github.io/DualDn/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18783v2-abstract-full').style.display = 'none'; document.getElementById('2409.18783v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ECCV 2024, Project page: https://openimaginglab.github.io/DualDn/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17996">arXiv:2409.17996</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.17996">pdf</a>, <a href="https://arxiv.org/format/2409.17996">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xin Cai</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+Z">Zhiyuan You</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hailong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wentao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17996v2-abstract-short" style="display: inline;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. To overcome these li&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'inline'; document.getElementById('2409.17996v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17996v2-abstract-full" style="display: none;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. To overcome these limitations, we introduce a novel two-stage approach for consistent and photorealistic lensless image reconstruction. The first stage of our approach ensures data consistency by focusing on accurately reconstructing the low-frequency content with a spatially varying deconvolution method that adjusts to changes in the Point Spread Function (PSF) across the camera&#39;s field of view. The second stage enhances photorealism by incorporating a generative prior from pre-trained diffusion models. By conditioning on the low-frequency content retrieved in the first stage, the diffusion model effectively reconstructs the high-frequency details that are typically lost in the lensless imaging process, while also maintaining image fidelity. Our method achieves a superior balance between data fidelity and visual quality compared to existing methods, as demonstrated with two popular lensless systems, PhlatCam and DiffuserCam. Project website: https://phocolens.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'none'; document.getElementById('2409.17996v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17515">arXiv:2409.17515</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.17515">pdf</a>, <a href="https://arxiv.org/format/2409.17515">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From News to Forecast: Integrating Event Analysis in LLM-Based Time Series Forecasting with Reflection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinlei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+M">Maike Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+J">Jing Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinjin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Junhua Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17515v3-abstract-short" style="display: inline;"> This paper introduces a novel approach that leverages Large Language Models (LLMs) and Generative Agents to enhance time series forecasting by reasoning across both text and time series data. With language as a medium, our method adaptively integrates social events into forecasting models, aligning news content with time series fluctuations to provide richer insights. Specifically, we utilize LLM-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17515v3-abstract-full').style.display = 'inline'; document.getElementById('2409.17515v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17515v3-abstract-full" style="display: none;"> This paper introduces a novel approach that leverages Large Language Models (LLMs) and Generative Agents to enhance time series forecasting by reasoning across both text and time series data. With language as a medium, our method adaptively integrates social events into forecasting models, aligning news content with time series fluctuations to provide richer insights. Specifically, we utilize LLM-based agents to iteratively filter out irrelevant news and employ human-like reasoning to evaluate predictions. This enables the model to analyze complex events, such as unexpected incidents and shifts in social behavior, and continuously refine the selection logic of news and the robustness of the agent&#39;s output. By integrating selected news events with time series data, we fine-tune a pre-trained LLM to predict sequences of digits in time series. The results demonstrate significant improvements in forecasting accuracy, suggesting a potential paradigm shift in time series forecasting through the effective utilization of unstructured news data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17515v3-abstract-full').style.display = 'none'; document.getElementById('2409.17515v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for NeurIPS 2024. Code and data are available at https://github.com/ameliawong1996/From_News_to_Forecast</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15101">arXiv:2409.15101</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15101">pdf</a>, <a href="https://arxiv.org/format/2409.15101">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GALD-SE: Guided Anisotropic Lightweight Diffusion for Efficient Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chengzhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jianjun Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+D">Dingding Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junfeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yonghong Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15101v4-abstract-short" style="display: inline;"> Speech enhancement is designed to enhance the intelligibility and quality of speech across diverse noise conditions. Recently, diffusion model has gained lots of attention in speech enhancement area, achieving competitive results. Current diffusion-based methods blur the signal with isotropic Gaussian noise and recover clean speech from the prior. However, these methods often suffer from a substan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15101v4-abstract-full').style.display = 'inline'; document.getElementById('2409.15101v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15101v4-abstract-full" style="display: none;"> Speech enhancement is designed to enhance the intelligibility and quality of speech across diverse noise conditions. Recently, diffusion model has gained lots of attention in speech enhancement area, achieving competitive results. Current diffusion-based methods blur the signal with isotropic Gaussian noise and recover clean speech from the prior. However, these methods often suffer from a substantial computational burden. We argue that the computational inefficiency partially stems from the oversight that speech enhancement is not purely a generative task; it primarily involves noise reduction and completion of missing information, while the clean clues in the original mixture do not need to be regenerated. In this paper, we propose a method that introduces noise with anisotropic guidance during the diffusion process, allowing the neural network to preserve clean clues within noisy recordings. This approach substantially reduces computational complexity while exhibiting robustness against various forms of noise and speech distortion. Experiments demonstrate that the proposed method achieves state-of-the-art results with only approximately 4.5 million parameters, a number significantly lower than that required by other diffusion methods. This effectively narrows the model size disparity between diffusion-based and predictive speech enhancement approaches. Additionally, the proposed method performs well in very noisy scenarios, demonstrating its potential for applications in highly challenging environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15101v4-abstract-full').style.display = 'none'; document.getElementById('2409.15101v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We make reassessment and update the author list. All authors have approved this version of the manuscript</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13884">arXiv:2409.13884</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13884">pdf</a>, <a href="https://arxiv.org/format/2409.13884">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Multi-LLM Debiasing Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Owens%2C+D+M">Deonna M. Owens</a>, <a href="/search/cs?searchtype=author&amp;query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Deilamsalehy%2C+H">Hanieh Deilamsalehy</a>, <a href="/search/cs?searchtype=author&amp;query=Lipka%2C+N">Nedim Lipka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13884v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are powerful tools with the potential to benefit society immensely, yet, they have demonstrated biases that perpetuate societal inequalities. Despite significant advancements in bias mitigation techniques using data augmentation, zero-shot prompting, and model fine-tuning, biases continuously persist, including subtle biases that may elude human detection. Recent resea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13884v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13884v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13884v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are powerful tools with the potential to benefit society immensely, yet, they have demonstrated biases that perpetuate societal inequalities. Despite significant advancements in bias mitigation techniques using data augmentation, zero-shot prompting, and model fine-tuning, biases continuously persist, including subtle biases that may elude human detection. Recent research has shown a growing interest in multi-LLM approaches, which have been demonstrated to be effective in improving the quality of reasoning and factuality in LLMs. Building on this approach, we propose a novel multi-LLM debiasing framework aimed at reducing bias in LLMs. Our work is the first to introduce and evaluate two distinct approaches within this framework for debiasing LLMs: a centralized method, where the conversation is facilitated by a single central LLM, and a decentralized method, where all models communicate directly. Our findings reveal that our multi-LLM framework significantly reduces bias in LLMs, outperforming the baseline method across several social groups. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13884v1-abstract-full').style.display = 'none'; document.getElementById('2409.13884v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13561">arXiv:2409.13561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13561">pdf</a>, <a href="https://arxiv.org/format/2409.13561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Demystifying and Extracting Fault-indicating Information from Logs for Failure Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhihan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jinyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yintong Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiazhen Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuangbin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Cong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Hui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zengyin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+M+R">Michael R. Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13561v1-abstract-short" style="display: inline;"> Logs are imperative in the maintenance of online service systems, which often encompass important information for effective failure mitigation. While existing anomaly detection methodologies facilitate the identification of anomalous logs within extensive runtime data, manual investigation of log messages by engineers remains essential to comprehend faults, which is labor-intensive and error-prone&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13561v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13561v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13561v1-abstract-full" style="display: none;"> Logs are imperative in the maintenance of online service systems, which often encompass important information for effective failure mitigation. While existing anomaly detection methodologies facilitate the identification of anomalous logs within extensive runtime data, manual investigation of log messages by engineers remains essential to comprehend faults, which is labor-intensive and error-prone. Upon examining the log-based troubleshooting practices at CloudA, we find that engineers typically prioritize two categories of log information for diagnosis. These include fault-indicating descriptions, which record abnormal system events, and fault-indicating parameters, which specify the associated entities. Motivated by this finding, we propose an approach to automatically extract such faultindicating information from logs for fault diagnosis, named LoFI. LoFI comprises two key stages. In the first stage, LoFI performs coarse-grained filtering to collect logs related to the faults based on semantic similarity. In the second stage, LoFI leverages a pre-trained language model with a novel prompt-based tuning method to extract fine-grained information of interest from the collected logs. We evaluate LoFI on logs collected from Apache Spark and an industrial dataset from CloudA. The experimental results demonstrate that LoFI outperforms all baseline methods by a significant margin, achieving an absolute improvement of 25.8~37.9 in F1 over the best baseline method, ChatGPT. This highlights the effectiveness of LoFI in recognizing fault-indicating information. Furthermore, the successful deployment of LoFI at CloudA and user studies validate the utility of our method. The code and data are available at https://github.com/Jun-jie-Huang/LoFI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13561v1-abstract-full').style.display = 'none'; document.getElementById('2409.13561v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by the 35th IEEE International Symposium on Software Reliability Engineering (ISSRE&#39;2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13551">arXiv:2409.13551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13551">pdf</a>, <a href="https://arxiv.org/format/2409.13551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3691620.3695503">10.1145/3691620.3695503 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Contextualized Data-Wrangling Code Generation in Computational Notebooks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiazhen Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+S">Shuai Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Inala%2C+J+P">Jeevana Priya Inala</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+M+R">Michael R. Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13551v1-abstract-short" style="display: inline;"> Data wrangling, the process of preparing raw data for further analysis in computational notebooks, is a crucial yet time-consuming step in data science. Code generation has the potential to automate the data wrangling process to reduce analysts&#39; overhead by translating user intents into executable code. Precisely generating data wrangling code necessitates a comprehensive consideration of the rich&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13551v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13551v1-abstract-full" style="display: none;"> Data wrangling, the process of preparing raw data for further analysis in computational notebooks, is a crucial yet time-consuming step in data science. Code generation has the potential to automate the data wrangling process to reduce analysts&#39; overhead by translating user intents into executable code. Precisely generating data wrangling code necessitates a comprehensive consideration of the rich context present in notebooks, including textual context, code context and data context. However, notebooks often interleave multiple non-linear analysis tasks into linear sequence of code blocks, where the contextual dependencies are not clearly reflected. Directly training models with source code blocks fails to fully exploit the contexts for accurate wrangling code generation. To bridge the gap, we aim to construct a high quality datasets with clear and rich contexts to help training models for data wrangling code generation tasks. In this work, we first propose an automated approach, CoCoMine to mine data-wrangling code generation examples with clear multi-modal contextual dependency. It first adopts data flow analysis to identify the code blocks containing data wrangling codes. Then, CoCoMine extracts the contextualized datawrangling code examples through tracing and replaying notebooks. With CoCoMine, we construct CoCoNote, a dataset containing 58,221 examples for Contextualized Data-wrangling Code generation in Notebooks. To demonstrate the effectiveness of our dataset, we finetune a range of pretrained code models and prompt various large language models on our task. Furthermore, we also propose DataCoder, which encodes data context and code&amp;textual contexts separately to enhance code generation. Experiment results demonstrate the significance of incorporating data context in data-wrangling code generation and the effectiveness of our model. We release code and data at url... <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13551v1-abstract-full').style.display = 'none'; document.getElementById('2409.13551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ASE 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13174">arXiv:2409.13174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13174">pdf</a>, <a href="https://arxiv.org/format/2409.13174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Manipulation Facing Threats: Evaluating Physical Vulnerabilities in End-to-End Vision Language Action Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+E">Erjia Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chengyuan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Z">Zhao Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiahang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiaxu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Renjing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13174v2-abstract-short" style="display: inline;"> Recently, driven by advancements in Multimodal Large Language Models (MLLMs), Vision Language Action Models (VLAMs) are being proposed to achieve better performance in open-vocabulary scenarios for robotic manipulation tasks. Since manipulation tasks involve direct interaction with the physical world, ensuring robustness and safety during the execution of this task is always a very critical issue.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13174v2-abstract-full').style.display = 'inline'; document.getElementById('2409.13174v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13174v2-abstract-full" style="display: none;"> Recently, driven by advancements in Multimodal Large Language Models (MLLMs), Vision Language Action Models (VLAMs) are being proposed to achieve better performance in open-vocabulary scenarios for robotic manipulation tasks. Since manipulation tasks involve direct interaction with the physical world, ensuring robustness and safety during the execution of this task is always a very critical issue. In this paper, by synthesizing current safety research on MLLMs and the specific application scenarios of the manipulation task in the physical world, we comprehensively evaluate VLAMs in the face of potential physical threats. Specifically, we propose the Physical Vulnerability Evaluating Pipeline (PVEP) that can incorporate as many visual modal physical threats as possible for evaluating the physical robustness of VLAMs. The physical threats in PVEP specifically include Out-of-Distribution, Typography-based Visual Prompts, and Adversarial Patch Attacks. By comparing the performance fluctuations of VLAMs before and after being attacked, we provide generalizable Analyses of how VLAMs respond to different physical security threats. Our project page is in this link: https://chaducheng.github.io/Manipulat-Facing-Threats/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13174v2-abstract-full').style.display = 'none'; document.getElementById('2409.13174v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10176">arXiv:2409.10176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.10176">pdf</a>, <a href="https://arxiv.org/format/2409.10176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> TCDformer-based Momentum Transfer Model for Long-term Sports Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiacheng Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiyuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Junjie Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+N">Ning He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10176v1-abstract-short" style="display: inline;"> Accurate sports prediction is a crucial skill for professional coaches, which can assist in developing effective training strategies and scientific competition tactics. Traditional methods often use complex mathematical statistical techniques to boost predictability, but this often is limited by dataset scale and has difficulty handling long-term predictions with variable distributions, notably un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10176v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10176v1-abstract-full" style="display: none;"> Accurate sports prediction is a crucial skill for professional coaches, which can assist in developing effective training strategies and scientific competition tactics. Traditional methods often use complex mathematical statistical techniques to boost predictability, but this often is limited by dataset scale and has difficulty handling long-term predictions with variable distributions, notably underperforming when predicting point-set-game multi-level matches. To deal with this challenge, this paper proposes TM2, a TCDformer-based Momentum Transfer Model for long-term sports prediction, which encompasses a momentum encoding module and a prediction module based on momentum transfer. TM2 initially encodes momentum in large-scale unstructured time series using the local linear scaling approximation (LLSA) module. Then it decomposes the reconstructed time series with momentum transfer into trend and seasonal components. The final prediction results are derived from the additive combination of a multilayer perceptron (MLP) for predicting trend components and wavelet attention mechanisms for seasonal components. Comprehensive experimental results show that on the 2023 Wimbledon men&#39;s tournament datasets, TM2 significantly surpasses existing sports prediction models in terms of performance, reducing MSE by 61.64% and MAE by 63.64%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10176v1-abstract-full').style.display = 'none'; document.getElementById('2409.10176v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under reviewing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07762">arXiv:2409.07762</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.07762">pdf</a>, <a href="https://arxiv.org/ps/2409.07762">ps</a>, <a href="https://arxiv.org/format/2409.07762">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring Kolmogorov-Arnold networks for realistic image sharpness assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shaode Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Ze Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhimu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiacheng Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+B">Bizu Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07762v2-abstract-short" style="display: inline;"> Score prediction is crucial in realistic image sharpness assessment after informative features are collected. Recently, Kolmogorov-Arnold networks (KANs) have been developed and witnessed remarkable success in data fitting. This study presents Taylor series based KAN (TaylorKAN). Then, different KANs are explored on four realistic image databases (BID2011, CID2013, CLIVE, and KonIQ-10k) for score&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07762v2-abstract-full').style.display = 'inline'; document.getElementById('2409.07762v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07762v2-abstract-full" style="display: none;"> Score prediction is crucial in realistic image sharpness assessment after informative features are collected. Recently, Kolmogorov-Arnold networks (KANs) have been developed and witnessed remarkable success in data fitting. This study presents Taylor series based KAN (TaylorKAN). Then, different KANs are explored on four realistic image databases (BID2011, CID2013, CLIVE, and KonIQ-10k) for score prediction by using 15 mid-level features and 2048 high-level features. When setting support vector regression as the baseline, experimental results indicate KANs are generally better or competitive, TaylorKAN is the best on three databases using mid-level feature input, while KANs are inferior on CLIVE when high-level features are used. This is the first study that explores KANs for image quality assessment. It sheds lights on how to select and improve KANs on related tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07762v2-abstract-full').style.display = 'none'; document.getElementById('2409.07762v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05681">arXiv:2409.05681</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05681">pdf</a>, <a href="https://arxiv.org/format/2409.05681">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SX-Stitch: An Efficient VMS-UNet Based Framework for Intraoperative Scoliosis X-Ray Image Stitching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Heting Gao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+M">Mingde He</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jinqian Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jason Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05681v1-abstract-short" style="display: inline;"> In scoliosis surgery, the limited field of view of the C-arm X-ray machine restricts the surgeons&#39; holistic analysis of spinal structures .This paper presents an end-to-end efficient and robust intraoperative X-ray image stitching method for scoliosis surgery,named SX-Stitch. The method is divided into two stages:segmentation and stitching. In the segmentation stage, We propose a medical image seg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05681v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05681v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05681v1-abstract-full" style="display: none;"> In scoliosis surgery, the limited field of view of the C-arm X-ray machine restricts the surgeons&#39; holistic analysis of spinal structures .This paper presents an end-to-end efficient and robust intraoperative X-ray image stitching method for scoliosis surgery,named SX-Stitch. The method is divided into two stages:segmentation and stitching. In the segmentation stage, We propose a medical image segmentation model named Vision Mamba of Spine-UNet (VMS-UNet), which utilizes the state space Mamba to capture long-distance contextual information while maintaining linear computational complexity, and incorporates the SimAM attention mechanism, significantly improving the segmentation performance.In the stitching stage, we simplify the alignment process between images to the minimization of a registration energy function. The total energy function is then optimized to order unordered images, and a hybrid energy function is introduced to optimize the best seam, effectively eliminating parallax artifacts. On the clinical dataset, Sx-Stitch demonstrates superiority over SOTA schemes both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05681v1-abstract-full').style.display = 'none'; document.getElementById('2409.05681v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04812">arXiv:2409.04812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04812">pdf</a>, <a href="https://arxiv.org/format/2409.04812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Power Line Aerial Image Restoration under dverse Weather: Datasets and Baselines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+B">Bin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bojun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiaoxin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinsong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Juping Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04812v1-abstract-short" style="display: inline;"> Power Line Autonomous Inspection (PLAI) plays a crucial role in the construction of smart grids due to its great advantages of low cost, high efficiency, and safe operation. PLAI is completed by accurately detecting the electrical components and defects in the aerial images captured by Unmanned Aerial Vehicles (UAVs). However, the visible quality of aerial images is inevitably degraded by adverse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04812v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04812v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04812v1-abstract-full" style="display: none;"> Power Line Autonomous Inspection (PLAI) plays a crucial role in the construction of smart grids due to its great advantages of low cost, high efficiency, and safe operation. PLAI is completed by accurately detecting the electrical components and defects in the aerial images captured by Unmanned Aerial Vehicles (UAVs). However, the visible quality of aerial images is inevitably degraded by adverse weather like haze, rain, or snow, which are found to drastically decrease the detection accuracy in our research. To circumvent this problem, we propose a new task of Power Line Aerial Image Restoration under Adverse Weather (PLAIR-AW), which aims to recover clean and high-quality images from degraded images with bad weather thus improving detection performance for PLAI. In this context, we are the first to release numerous corresponding datasets, namely, HazeCPLID, HazeTTPLA, HazeInsPLAD for power line aerial image dehazing, RainCPLID, RainTTPLA, RainInsPLAD for power line aerial image deraining, SnowCPLID, SnowInsPLAD for power line aerial image desnowing, which are synthesized upon the public power line aerial image datasets of CPLID, TTPLA, InsPLAD following the mathematical models. Meanwhile, we select numerous state-of-the-art methods from image restoration community as the baseline methods for PLAIR-AW. At last, we conduct large-scale empirical experiments to evaluate the performance of baseline methods on the proposed datasets. The proposed datasets and trained models are available at https://github.com/ntuhubin/PLAIR-AW. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04812v1-abstract-full').style.display = 'none'; document.getElementById('2409.04812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gu%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10