Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 167 results for author: <span class="mathjax">Luo, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Luo%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Luo, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Luo%2C+R&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Luo, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Luo%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Luo%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+R&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04376">arXiv:2411.04376</a> <span> [<a href="https://arxiv.org/pdf/2411.04376">pdf</a>, <a href="https://arxiv.org/format/2411.04376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Game-Theoretic Defenses for Robust Conformal Prediction Against Adversarial Attacks in Medical Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Rui Luo</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+J">Jie Bao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhixin Zhou</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+C">Chuangyin Dang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04376v1-abstract-short" style="display: inline;"> Adversarial attacks pose significant threats to the reliability and safety of deep learning models, especially in critical domains such as medical imaging. This paper introduces a novel framework that integrates conformal prediction with game-theoretic defensive strategies to enhance model robustness against both known and unknown adversarial perturbations. 
We address three primary research questions: constructing valid and efficient conformal prediction sets under known attacks (RQ1), ensuring coverage under unknown attacks through conservative thresholding (RQ2), and determining optimal defensive strategies within a zero-sum game framework (RQ3). Our methodology involves training specialized defensive models against specific attack types and employing maximum and minimum classifiers to aggregate defenses effectively. Extensive experiments conducted on the MedMNIST datasets, including PathMNIST, OrganAMNIST, and TissueMNIST, demonstrate that our approach maintains high coverage guarantees while minimizing prediction set sizes. The game-theoretic analysis reveals that the optimal defensive strategy often converges to a singular robust model, outperforming uniform and simple strategies across all evaluated datasets. This work advances the state-of-the-art in uncertainty quantification and adversarial robustness, providing a reliable mechanism for deploying deep learning models in adversarial environments.
Submitted 6 November, 2024; originally announced November 2024.
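A minimal split-conformal sketch of the thresholding ideas in RQ1 and RQ2: calibrate one nonconformity threshold per known attack, and fall back to the largest (most conservative) of them when the test-time attack is unknown. The score choice, data layout, and function names below are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

def conformal_threshold(cal_scores, alpha):
    """Finite-sample (1 - alpha) quantile of calibration nonconformity scores."""
    n = len(cal_scores)
    q = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    return np.quantile(cal_scores, q, method="higher")

def conservative_prediction_set(class_probs, cal_scores_per_attack, alpha=0.1):
    """Include every label whose score 1 - p(y|x) clears the most conservative
    per-attack threshold, so coverage is retained under an unknown attack (cf. RQ2)."""
    t_max = max(conformal_threshold(s, alpha) for s in cal_scores_per_attack)
    return np.where(1.0 - np.asarray(class_probs) <= t_max)[0]
```

Taking the maximum over per-attack thresholds can only enlarge the set, which is what makes the coverage guarantee conservative rather than exact.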
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01558">arXiv:2411.01558</a> <span> [<a href="https://arxiv.org/pdf/2411.01558">pdf</a>, <a href="https://arxiv.org/format/2411.01558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Conformal Inference by Particle Filtering under Hidden Markov Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+X">Xiaoyi Su</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhixin Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Rui Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01558v1-abstract-short" style="display: inline;"> Conformal inference is a statistical method used to construct prediction sets for point predictors, providing reliable uncertainty quantification with probability guarantees. This method utilizes historical labeled data to estimate the conformity or nonconformity between predictions and true labels. However, conducting conformal inference for hidden states under hidden Markov models (HMMs) present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01558v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01558v1-abstract-full" style="display: none;"> Conformal inference is a statistical method used to construct prediction sets for point predictors, providing reliable uncertainty quantification with probability guarantees. This method utilizes historical labeled data to estimate the conformity or nonconformity between predictions and true labels. However, conducting conformal inference for hidden states under hidden Markov models (HMMs) presents a significant challenge, as the hidden state data is unavailable, resulting in the absence of a true label set to serve as a conformal calibration set. This paper proposes an adaptive conformal inference framework that leverages a particle filtering approach to address this issue. Rather than directly focusing on the unobservable hidden state, we innovatively use weighted particles as an approximation of the actual posterior distribution of the hidden state. Our goal is to produce prediction sets that encompass these particles to achieve a specific aggregate weight sum, referred to as the aggregated coverage level. The proposed framework can adapt online to the time-varying distribution of data and achieve the defined marginal aggregated coverage level in both one-step and multi-step inference over the long term. We verify the effectiveness of this approach through a real-time target localization simulation study. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01558v1-abstract-full').style.display = 'none'; document.getElementById('2411.01558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23907">arXiv:2410.23907</a> <span> [<a href="https://arxiv.org/pdf/2410.23907">pdf</a>, <a href="https://arxiv.org/format/2410.23907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IP-MOT: Instance Prompt Learning for Cross-Domain Multi-Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zikai Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Longze Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23907v1-abstract-short" style="display: inline;"> Multi-Object Tracking (MOT) aims to associate multiple objects across video frames and is a challenging vision task due to inherent complexities in the tracking environment. Most existing approaches train and track within a single domain, resulting in a lack of cross-domain generalizability to data from other domains. While several works have introduced natural language representation to bridge th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23907v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23907v1-abstract-full" style="display: none;"> Multi-Object Tracking (MOT) aims to associate multiple objects across video frames and is a challenging vision task due to inherent complexities in the tracking environment. Most existing approaches train and track within a single domain, resulting in a lack of cross-domain generalizability to data from other domains. While several works have introduced natural language representation to bridge the domain gap in visual tracking, these textual descriptions often provide too high-level a view and fail to distinguish various instances within the same class. In this paper, we address this limitation by developing IP-MOT, an end-to-end transformer model for MOT that operates without concrete textual descriptions. 
3. arXiv:2410.23907 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: IP-MOT: Instance Prompt Learning for Cross-Domain Multi-Object Tracking
Authors: Run Luo, Zikai Song, Longze Chen, Yunshui Li, Min Yang, Wei Yang
Abstract: Multi-Object Tracking (MOT) aims to associate multiple objects across video frames and is a challenging vision task due to inherent complexities in the tracking environment. Most existing approaches train and track within a single domain, resulting in a lack of cross-domain generalizability to data from other domains. While several works have introduced natural language representation to bridge the domain gap in visual tracking, these textual descriptions often provide too high-level a view and fail to distinguish various instances within the same class. In this paper, we address this limitation by developing IP-MOT, an end-to-end transformer model for MOT that operates without concrete textual descriptions. Our approach is underpinned by two key innovations: firstly, leveraging a pre-trained vision-language model, we obtain instance-level pseudo textual descriptions via prompt-tuning, which are invariant across different tracking scenes; secondly, we introduce a query-balanced strategy, augmented by knowledge distillation, to further boost the generalization capabilities of our model. Extensive experiments conducted on three widely used MOT benchmarks, including MOT17, MOT20, and DanceTrack, demonstrate that our approach not only achieves competitive performance on same-domain data compared to state-of-the-art models but also significantly improves the performance of query-based trackers by large margins for cross-domain inputs.
Submitted 30 October, 2024; originally announced October 2024.

4. arXiv:2410.09560 [pdf, other] cs.IR (Information Retrieval); cs.LG (Machine Learning)
Title: Towards Scalable Semantic Representation for Recommendation
Authors: Taolin Zhang, Junwei Pan, Jinpeng Wang, Yaohua Zha, Tao Dai, Bin Chen, Ruisheng Luo, Xiaoxiang Deng, Yuan Wang, Ming Yue, Jie Jiang, Shu-Tao Xia
Abstract: With recent advances in large language models (LLMs), a growing number of studies have developed LLM-based Semantic IDs to enhance the performance of recommendation systems. However, the dimension of these embeddings needs to match that of the ID embedding in recommendation, which is usually much smaller than the original length.
Such dimension compression results in inevitable losses in discriminability and dimension robustness of the LLM embeddings, which motivates us to scale up the semantic representation. In this paper, we propose Mixture-of-Codes, which first constructs multiple independent codebooks for LLM representation in the indexing stage, and then utilizes the Semantic Representation along with a fusion module for the downstream recommendation stage. Extensive analysis and experiments demonstrate that our method achieves superior discriminability and dimension robustness scalability, leading to the best scale-up performance in recommendations.
Submitted 12 October, 2024; originally announced October 2024.
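The indexing stage can be pictured as quantizing each LLM embedding with several independent codebooks and letting the recommender consume one discrete code per codebook. The disjoint-slice layout, k-means quantizer, and codebook sizes below are illustrative assumptions, not the authors' exact design.

```python
import numpy as np
from sklearn.cluster import KMeans

def fit_codebooks(llm_emb, n_codebooks=4, codes_per_book=256, seed=0):
    """Fit independent codebooks; here each one sees a disjoint slice
    of the LLM embedding (an assumed choice)."""
    slices = np.array_split(llm_emb, n_codebooks, axis=1)
    return [KMeans(n_clusters=codes_per_book, random_state=seed).fit(s)
            for s in slices]

def to_semantic_ids(llm_emb, codebooks):
    """Map each item embedding to one discrete code per codebook;
    the recommender then embeds and fuses these codes downstream."""
    slices = np.array_split(llm_emb, len(codebooks), axis=1)
    return np.stack([cb.predict(s) for cb, s in zip(codebooks, slices)], axis=1)
```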
5. arXiv:2410.03927 [pdf, other] q-bio.BM (Biomolecules); cs.LG (Machine Learning); q-bio.QM (Quantitative Methods)
Title: End-to-End Reaction Field Energy Modeling via Deep Learning based Voxel-to-voxel Transform
Authors: Yongxian Wu, Qiang Zhu, Ray Luo
Abstract: In computational biochemistry and biophysics, understanding the role of electrostatic interactions is crucial for elucidating the structure, dynamics, and function of biomolecules. The Poisson-Boltzmann (PB) equation is a foundational tool for modeling these interactions by describing the electrostatic potential in and around charged molecules. However, solving the PB equation presents significant computational challenges due to the complexity of biomolecular surfaces and the need to account for mobile ions. While traditional numerical methods for solving the PB equation are accurate, they are computationally expensive and scale poorly with increasing system size. To address these challenges, we introduce PBNeF, a novel machine learning approach inspired by recent advancements in neural network-based partial differential equation solvers. Our method formulates the input and boundary electrostatic conditions of the PB equation into a learnable voxel representation, enabling the use of a neural field transformer to predict the PB solution and, subsequently, the reaction field potential energy. Extensive experiments demonstrate that PBNeF achieves over a 100-fold speedup compared to traditional PB solvers, while maintaining accuracy comparable to the Generalized Born (GB) model.
Submitted 4 October, 2024; originally announced October 2024.
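For context, the nonlinear Poisson-Boltzmann equation that PBNeF learns to emulate is commonly written (in a reduced form that absorbs unit constants into the symbols) as

$$\nabla \cdot \big[\, \epsilon(\mathbf{r})\, \nabla \phi(\mathbf{r}) \,\big] \;-\; \bar{\kappa}^{2}(\mathbf{r})\, \sinh \phi(\mathbf{r}) \;=\; -4\pi \rho(\mathbf{r}),$$

where $\epsilon(\mathbf{r})$ is the position-dependent dielectric coefficient, $\phi$ the electrostatic potential, $\bar{\kappa}$ the ion-screening parameter (nonzero only in the solvent region), and $\rho$ the fixed solute charge density; the reaction field energy targeted by the paper is then derived from $\phi$. Exact unit conventions vary between solvers.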
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01504">arXiv:2410.01504</a> <span> [<a href="https://arxiv.org/pdf/2410.01504">pdf</a>, <a href="https://arxiv.org/format/2410.01504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PersonaMath: Enhancing Math Reasoning through Persona-Driven Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jing Luo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Longze Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Liang Zhu</a>, <a href="/search/cs?searchtype=author&query=Ao%2C+C">Chang Ao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaming Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yukun Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xin Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wen Yang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jiayuan Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengming Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01504v1-abstract-short" style="display: inline;"> While closed-source Large Language Models (LLMs) demonstrate strong mathematical problem-solving abilities, open-source models continue to struggle with such tasks. To bridge this gap, we propose a data augmentation approach and introduce PersonaMathQA, a dataset derived from MATH and GSM8K, on which we train the PersonaMath models. Our approach consists of two stages: the first stage is learning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01504v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01504v1-abstract-full" style="display: none;"> While closed-source Large Language Models (LLMs) demonstrate strong mathematical problem-solving abilities, open-source models continue to struggle with such tasks. To bridge this gap, we propose a data augmentation approach and introduce PersonaMathQA, a dataset derived from MATH and GSM8K, on which we train the PersonaMath models. Our approach consists of two stages: the first stage is learning from Persona Diversification, and the second stage is learning from Reflection. In the first stage, we regenerate detailed chain-of-thought (CoT) solutions as instructions using a closed-source LLM and introduce a novel persona-driven data augmentation technique to enhance the dataset's quantity and diversity. In the second stage, we incorporate reflection to fully leverage more challenging and valuable questions. Evaluation of our PersonaMath models on MATH and GSM8K reveals that the PersonaMath-7B model (based on LLaMA-2-7B) achieves an accuracy of 24.2% on MATH and 68.7% on GSM8K, surpassing all baseline methods and achieving state-of-the-art performance. 
Notably, our dataset contains only 70.3K data points (merely 17.8% of MetaMathQA and 27% of MathInstruct), yet our model outperforms these baselines, demonstrating the high quality and diversity of our dataset, which enables more efficient model training. We open-source the PersonaMathQA dataset, PersonaMath models, and our code for public usage.
Submitted 2 October, 2024; originally announced October 2024.

7. arXiv:2409.18943 [pdf, other] cs.CL (Computation and Language)
Title: Ruler: A Model-Agnostic Method to Control Generated Length for Large Language Models
Authors: Jiaming Li, Lei Zhang, Yunshui Li, Ziqiang Liu, Yuelin Bai, Run Luo, Longze Chen, Min Yang
Abstract: The instruction-following ability of large language models enables humans to interact with AI agents in a natural way. However, when required to generate responses of a specific length, large language models often struggle to meet users' needs due to their inherent difficulty in accurately perceiving numerical constraints.
To explore the ability of large language models to control the length of generated responses, we propose the Target Length Generation Task (TLG) and design two metrics, Precise Match (PM) and Flexible Match (FM), to evaluate the model's performance in adhering to specified response lengths. Furthermore, we introduce a novel, model-agnostic approach called Ruler, which employs Meta Length Tokens (MLTs) to enhance the instruction-following ability of large language models under length-constrained instructions. Specifically, Ruler equips LLMs with the ability to generate responses of a specified length based on length constraints within the instructions. Moreover, Ruler can automatically generate an appropriate MLT when length constraints are not explicitly provided, demonstrating excellent versatility and generalization. Comprehensive experiments show the effectiveness of Ruler across different LLMs on the Target Length Generation Task, e.g., an average gain of 27.97 on PM and 29.57 on FM at the All level. In addition, we conduct extensive ablation experiments to further substantiate the efficacy and generalization of Ruler. Our code and data are available at https://github.com/Geaming2002/Ruler.
Submitted 1 October, 2024; v1 submitted 27 September, 2024; originally announced September 2024.
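The Meta Length Token mechanism can be sketched as conditioning the model on a discrete length bucket attached to the instruction at training and inference time. The bucket boundaries and token format here are assumptions for illustration, not the tokens used in the paper.

```python
def bucket_mlt(target_words):
    """Map a requested length to a discrete Meta Length Token (illustrative buckets)."""
    for bound in (50, 100, 200, 400, 800):
        if target_words <= bound:
            return f"[MLT:{bound}]"
    return "[MLT:MAX]"

def with_mlt(instruction, target_words=None):
    """Attach an MLT so generation is conditioned on the desired response length.
    When no constraint is given, a trained model would emit its own MLT first."""
    mlt = bucket_mlt(target_words) if target_words else ""
    return f"{mlt} {instruction}".strip()
```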
8. arXiv:2409.14082 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: PTD-SQL: Partitioning and Targeted Drilling with LLMs in Text-to-SQL
Authors: Ruilin Luo, Liyuan Wang, Binghuai Lin, Zicheng Lin, Yujiu Yang
Abstract: Large Language Models (LLMs) have emerged as powerful tools for Text-to-SQL tasks, exhibiting remarkable reasoning capabilities. Different from tasks such as math word problems and commonsense reasoning, SQL solutions have a relatively fixed pattern. This facilitates the investigation of whether LLMs can benefit from categorical thinking, mirroring how humans acquire knowledge through inductive reasoning based on comparable examples. In this study, we propose that employing query group partitioning allows LLMs to focus on learning the thought processes specific to a single problem type, consequently enhancing their reasoning abilities across diverse difficulty levels and problem categories. Our experiments reveal that multiple advanced LLMs, when equipped with PTD-SQL, can either surpass or match previous state-of-the-art (SOTA) methods on the Spider and BIRD datasets. Intriguingly, models with varying initial performances have exhibited significant improvements, mainly at the boundary of their capabilities after targeted drilling, suggesting a parallel with human progress. Code is available at https://github.com/lrlbbzl/PTD-SQL.
Submitted 21 September, 2024; originally announced September 2024.
Comments: EMNLP 2024 Main Conference. Revised by ARR April and ARR June.
32 pages, 7 figures and 30 tables.
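The query-group-partitioning idea reduces to a simple retrieval rule: tag each training exemplar with its problem type and build few-shot prompts only from same-type exemplars, so the model drills one thought pattern at a time. The type labels and prompt format below are illustrative assumptions, not the paper's prompt.

```python
from collections import defaultdict

def partition_exemplars(exemplars):
    """Group (question, sql, query_type) exemplars by query type."""
    groups = defaultdict(list)
    for question, sql, query_type in exemplars:
        groups[query_type].append((question, sql))
    return groups

def targeted_prompt(groups, question, query_type, k=3):
    """Few-shot prompt drawn only from the matching partition."""
    shots = groups.get(query_type, [])[:k]
    demos = "\n\n".join(f"Q: {q}\nSQL: {s}" for q, s in shots)
    return f"{demos}\n\nQ: {question}\nSQL:"
```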
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12849v1-abstract-full').style.display = 'none'; document.getElementById('2409.12849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11844">arXiv:2409.11844</a> <span> [<a href="https://arxiv.org/pdf/2409.11844">pdf</a>, <a href="https://arxiv.org/format/2409.11844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MEOW: MEMOry Supervised LLM Unlearning Via Inverted Facts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+T">Tianle Gu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kexin Huang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruilin Luo</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuanqi Yao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+Y">Yan Teng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingchun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11844v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM Unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: 1. Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. 2. Efficiency:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11844v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11844v1-abstract-full" style="display: none;"> Large Language Models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM Unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: 1. Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. 2. Efficiency: many methods either involve adding similarly sized models, which slows down unlearning or inference, or require retain data that are difficult to obtain. 3. Robustness: even effective methods may still leak data via extraction techniques. To address these challenges, we propose MEOW, a simple yet effective gradient descent-based unlearning method. Specifically, we use an offline LLM to generate a set of inverted facts. Then, we design a new metric, MEMO, to quantify memorization in LLMs. 
10. arXiv:2409.11844 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: MEOW: MEMOry Supervised LLM Unlearning Via Inverted Facts
Authors: Tianle Gu, Kexin Huang, Ruilin Luo, Yuanqi Yao, Yujiu Yang, Yan Teng, Yingchun Wang
Abstract: Large Language Models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM Unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: 1. Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. 2. Efficiency: many methods either involve adding similarly sized models, which slows down unlearning or inference, or require retain data that are difficult to obtain. 3. Robustness: even effective methods may still leak data via extraction techniques. To address these challenges, we propose MEOW, a simple yet effective gradient descent-based unlearning method. Specifically, we use an offline LLM to generate a set of inverted facts. Then, we design a new metric, MEMO, to quantify memorization in LLMs. Finally, based on the signals provided by MEMO, we select the most appropriate set of inverted facts and finetune the model based on them. We evaluate MEOW on the commonly used unlearning benchmark ToFU, with Llama2-7B-Chat and Phi-1.5B, and test it on both NLU and NLG tasks. Results demonstrate significant improvement of MEOW in forget quality without substantial loss in model utility. Meanwhile, MEOW does not exhibit significant degradation in NLU or NLG capabilities, and there is even a slight improvement in NLU performance.
Submitted 18 September, 2024; originally announced September 2024.
11. arXiv:2409.05840 [pdf, other] cs.CL (Computation and Language)
Title: MMEvol: Empowering Multimodal Large Language Models with Evol-Instruct
Authors: Run Luo, Haonan Zhang, Longze Chen, Ting-En Lin, Xiong Liu, Yuchuan Wu, Min Yang, Minzheng Wang, Pengpeng Zeng, Lianli Gao, Heng Tao Shen, Yunshui Li, Xiaobo Xia, Fei Huang, Jingkuan Song, Yongbin Li
Abstract: The development of Multimodal Large Language Models (MLLMs) has seen significant advancements with increasing demands in various fields (e.g., multimodal agents, embodied intelligence). While model-driven approaches attempt to enhance MLLMs' capabilities through diverse architectures, the gains have become increasingly marginal. Conversely, data-driven methods, which scale up image-text instruction data, are more effective but face limited data diversity and complexity challenges. The absence of high-quality data constitutes a significant development barrier for MLLMs. To address the data quality bottleneck, we propose MMEvol, a novel multimodal instruction data evolution framework. This framework iteratively improves data quality through a refined combination of fine-grained perception, cognitive reasoning, and interaction evolution, generating a more complex and diverse image-text instruction dataset that empowers MLLMs with enhanced capabilities. Beginning with an initial set of instructions, SEED-163K, we utilize MMEvol to systematically broaden the diversity of instruction types, extend visual reasoning steps to improve cognitive reasoning abilities, and thoroughly explore fine-grained information within images to enhance visual understanding and robustness. To comprehensively evaluate the effectiveness of our approach, we conduct extensive qualitative analysis and quantitative experiments across 13 vision-language tasks. Compared to baseline models trained with the initial seed data, the results demonstrate that our method achieves an average accuracy improvement of 3.1 percentage points. Furthermore, our approach reaches state-of-the-art (SOTA) performance in nine tasks using significantly less data compared to state-of-the-art models.
Submitted 19 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00487">arXiv:2409.00487</a> <span> [<a href="https://arxiv.org/pdf/2409.00487">pdf</a>, <a href="https://arxiv.org/format/2409.00487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TrackSSM: A General Motion Predictor by State-Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+B">Bin Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zelin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00487v2-abstract-short" style="display: inline;"> Temporal motion modeling has always been a key component in multiple object tracking (MOT) which can ensure smooth trajectory movement and provide accurate positional information to enhance association precision. However, current motion models struggle to be both efficient and effective across different application scenarios. To this end, we propose TrackSSM inspired by the recently popular state… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00487v2-abstract-full').style.display = 'inline'; document.getElementById('2409.00487v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00487v2-abstract-full" style="display: none;"> Temporal motion modeling has always been a key component in multiple object tracking (MOT) which can ensure smooth trajectory movement and provide accurate positional information to enhance association precision. However, current motion models struggle to be both efficient and effective across different application scenarios. To this end, we propose TrackSSM inspired by the recently popular state space models (SSM), a unified encoder-decoder motion framework that uses data-dependent state space model to perform temporal motion of trajectories. Specifically, we propose Flow-SSM, a module that utilizes the position and motion information from historical trajectories to guide the temporal state transition of object bounding boxes. Based on Flow-SSM, we design a flow decoder. It is composed of a cascaded motion decoding module employing Flow-SSM, which can use the encoded flow information to complete the temporal position prediction of trajectories. Additionally, we propose a Step-by-Step Linear (S$^2$L) training strategy. By performing linear interpolation between the positions of the object in the previous frame and the current frame, we construct the pseudo labels of step-by-step linear training, ensuring that the trajectory flow information can better guide the object bounding box in completing temporal transitions. TrackSSM utilizes a simple Mamba-Block to build a motion encoder for historical trajectories, forming a temporal motion model with an encoder-decoder structure in conjunction with the flow decoder. 
TrackSSM is applicable to various tracking scenarios and achieves excellent tracking performance across multiple benchmarks, further extending the potential of SSM-like temporal motion models in multi-object tracking tasks. Code and models are publicly available at \url{https://github.com/Xavier-Lin/TrackSSM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00487v2-abstract-full').style.display = 'none'; document.getElementById('2409.00487v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13724">arXiv:2408.13724</a> <span> [<a href="https://arxiv.org/pdf/2408.13724">pdf</a>, <a href="https://arxiv.org/format/2408.13724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> PhysPart: Physically Plausible Part Completion for Interactable Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Rundong Luo</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+H">Haoran Geng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Congyue Deng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Puhao Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zan Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+B">Baoxiong Jia</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Siyuan Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13724v2-abstract-short" style="display: inline;"> Interactable objects are ubiquitous in our daily lives. Recent advances in 3D generative models make it possible to automate the modeling of these objects, benefiting a range of applications from 3D printing to the creation of robot simulation environments. However, while significant progress has been made in modeling 3D shapes and appearances, modeling object physics, particularly for interactabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13724v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13724v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13724v2-abstract-full" style="display: none;"> Interactable objects are ubiquitous in our daily lives. Recent advances in 3D generative models make it possible to automate the modeling of these objects, benefiting a range of applications from 3D printing to the creation of robot simulation environments. 
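
The S$^2$L idea is easy to state concretely. Below is a minimal sketch of the interpolation step, assuming boxes are (cx, cy, w, h) vectors; the function name and parameterization are illustrative, not taken from the TrackSSM code.

```python
import numpy as np

def s2l_pseudo_labels(box_prev, box_curr, num_steps):
    """Sketch of Step-by-Step Linear (S^2L) supervision: linearly interpolate an
    object's box between consecutive frames to obtain intermediate pseudo-labels.
    Boxes are (cx, cy, w, h) arrays; TrackSSM's exact parameterization may differ."""
    box_prev = np.asarray(box_prev, dtype=float)
    box_curr = np.asarray(box_curr, dtype=float)
    alphas = np.linspace(0.0, 1.0, num_steps + 1)[1:]  # exclude the previous frame itself
    return [(1 - a) * box_prev + a * box_curr for a in alphas]

# Example: three intermediate targets between two frames.
steps = s2l_pseudo_labels([100, 50, 40, 80], [112, 53, 40, 82], num_steps=3)
```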

arXiv:2408.13724 [pdf, other] cs.CV cs.RO
PhysPart: Physically Plausible Part Completion for Interactable Objects
Authors: Rundong Luo, Haoran Geng, Congyue Deng, Puhao Li, Zan Wang, Baoxiong Jia, Leonidas Guibas, Siyuan Huang
Abstract: Interactable objects are ubiquitous in our daily lives. Recent advances in 3D generative models make it possible to automate the modeling of these objects, benefiting a range of applications from 3D printing to the creation of robot simulation environments. However, while significant progress has been made in modeling 3D shapes and appearances, modeling object physics, particularly for interactable objects, remains challenging due to the physical constraints imposed by inter-part motions. In this paper, we tackle the problem of physically plausible part completion for interactable objects, aiming to generate 3D parts that not only fit precisely into the object but also allow smooth part motions. To this end, we propose a diffusion-based part generation model that incorporates geometric conditioning through classifier-free guidance and formulates physical constraints as a set of stability and mobility losses to guide the sampling process. Additionally, we demonstrate the generation of dependent parts, paving the way toward sequential part generation for objects with complex part-whole hierarchies. Experimentally, we introduce a new metric for measuring physical plausibility based on motion success rates. Our model outperforms existing baselines over shape and physical metrics, especially those that do not adequately model physical constraints. We also demonstrate applications in 3D printing, robot manipulation, and sequential part generation, showing the strength of our method on realistic tasks that demand high physical plausibility.
Submitted 28 August, 2024; v1 submitted 25 August, 2024; originally announced August 2024.

arXiv:2408.10939 [pdf, other] cs.LG stat.ML
Conformalized Interval Arithmetic with Symmetric Calibration
Authors: Rui Luo, Zhixin Zhou
Abstract: Uncertainty quantification is essential in decision-making, especially when joint distributions of random variables are involved. While conformal prediction provides distribution-free prediction sets with valid coverage guarantees, it traditionally focuses on single predictions. This paper introduces novel conformal prediction methods for estimating the sum or average of unknown labels over specific index sets. We extend conformal prediction intervals for a single target to prediction intervals for the sum of multiple targets, and prove the validity of the proposed method under permutation-invariance assumptions. We also apply our algorithms to class-average estimation and path-cost prediction tasks, and show that our method outperforms existing conformalized approaches as well as non-conformal approaches.
Submitted 20 August, 2024; originally announced August 2024.
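
For intuition, a naive baseline for conformalizing a sum (not the paper's symmetric calibration, which is designed to improve on exactly this) can be built from ordinary split-conformal intervals and a union bound:

```python
import numpy as np

def conformal_sum_interval(cal_residuals, test_preds, alpha=0.1):
    """Naive baseline: give each of the m test targets a split-conformal interval at
    level 1 - alpha/m, then sum the endpoints; by a union bound the summed interval
    covers the true sum with probability >= 1 - alpha.
    cal_residuals: |y - yhat| on a held-out calibration set."""
    m = len(test_preds)
    n = len(cal_residuals)
    k = int(np.ceil((n + 1) * (1 - alpha / m)))
    # Strictly, k > n would mean an infinite interval; clipped here for simplicity,
    # so n should be large enough that k <= n.
    q = np.sort(cal_residuals)[min(k, n) - 1]
    center = float(np.sum(test_preds))
    return center - m * q, center + m * q
```

The Bonferroni adjustment makes each per-target interval wide, which is why calibrating a score for the aggregate directly, as the paper proposes, can yield tighter sets.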

arXiv:2408.01139 [pdf, other] cs.AI cs.CV
Interpreting Global Perturbation Robustness of Image Models using Axiomatic Spectral Importance Decomposition
Authors: Róisín Luo, James McDermott, Colm O'Riordan
Abstract: Perturbation robustness evaluates the vulnerabilities of models arising from a variety of perturbations, such as data corruptions and adversarial attacks. Understanding the mechanisms of perturbation robustness is critical for global interpretability. We present a model-agnostic, global mechanistic interpretability method to interpret the perturbation robustness of image models. This research is motivated by two key aspects. First, previous global interpretability works, in tandem with robustness benchmarks such as mean corruption error (mCE), are not designed to directly interpret the mechanisms of perturbation robustness within image models. Second, we notice that the spectral signal-to-noise ratios (SNR) of perturbed natural images decay exponentially over frequency. This power-law-like decay implies that low-frequency signals are generally more robust than high-frequency signals, yet high classification accuracy cannot be achieved by low-frequency signals alone. By applying Shapley value theory, our method axiomatically quantifies the predictive powers of robust features and non-robust features within an information-theoretic framework. Our method, dubbed I-ASIDE (Image Axiomatic Spectral Importance Decomposition Explanation), provides a unique insight into model robustness mechanisms. We conduct extensive experiments over a variety of vision models pre-trained on ImageNet to show that I-ASIDE can not only measure perturbation robustness but also provide interpretations of its mechanisms.
Submitted 18 August, 2024; v1 submitted 2 August, 2024; originally announced August 2024.
Comments: Accepted by Transactions on Machine Learning Research (TMLR 2024)
Journal ref: Transactions on Machine Learning Research (TMLR), 2024

arXiv:2408.00923 [pdf, other] cs.CV cs.AI
Reclaiming Residual Knowledge: A Novel Paradigm to Low-Bit Quantization
Authors: Róisín Luo, Alexandru Drimbarean, James McDermott, Colm O'Riordan
Abstract: This paper explores a novel paradigm in low-bit (i.e. 4-bit or lower) quantization, differing from existing state-of-the-art methods, by framing optimal quantization as an architecture search problem within convolutional neural networks (ConvNets). Our framework, dubbed CoRa (Optimal Quantization Residual Convolutional Operator Low-Rank Adaptation), is motivated by two key aspects. First, quantization residual knowledge, i.e. the information lost between floating-point weights and quantized weights, has long been neglected by the research community; reclaiming this critical residual knowledge, at an infinitesimal extra parameter cost, can reverse performance degradation without training. Second, state-of-the-art quantization frameworks search for optimal quantized weights to address the performance degradation, yet the vast search spaces in weight optimization pose a challenge for efficient optimization in large models; for example, the state-of-the-art BRECQ requires $2 \times 10^4$ iterations to quantize a model. Fundamentally differing from existing methods, CoRa searches for the optimal architectures of low-rank adapters, which reclaim the critical quantization residual knowledge, within search spaces that are many orders of magnitude smaller than the weight spaces. The low-rank adapters approximate the quantization residual weights discarded by previous methods. We evaluate our approach over multiple pre-trained ConvNets on ImageNet. CoRa achieves performance comparable to both state-of-the-art quantization-aware training and post-training quantization baselines, in 4-bit and 3-bit quantization, using fewer than 250 iterations on a small calibration set of 1600 images. CoRa thus establishes a new state of the art in optimization efficiency for low-bit quantization.
Submitted 1 August, 2024; originally announced August 2024.
Comments: Accepted by The 35th British Machine Vision Conference (BMVC 2024)
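
The residual-reclamation idea can be illustrated independently of the architecture search. The sketch below, with assumed shapes and a toy uniform quantizer, approximates the quantization residual of one weight matrix by a truncated SVD; CoRa itself searches over adapter architectures rather than fixing a rank.

```python
import numpy as np

def quantize_uniform(w, bits=4):
    """Toy symmetric uniform quantizer (illustration only)."""
    scale = np.abs(w).max() / (2 ** (bits - 1) - 1)
    return np.round(w / scale) * scale

def low_rank_residual(w, bits=4, rank=8):
    """Approximate the quantization residual W - Q(W) with a truncated SVD, so
    inference can use Q(W) + U_r S_r V_r^T at a small extra parameter cost.
    A 4-D conv kernel would be reshaped to 2-D before this step."""
    w_q = quantize_uniform(w, bits)
    u, s, vt = np.linalg.svd(w - w_q, full_matrices=False)
    adapter = (u[:, :rank] * s[:rank]) @ vt[:rank]
    return w_q, adapter  # effective weight: w_q + adapter

w = np.random.randn(64, 64)
w_q, adapter = low_rank_residual(w)
print(np.linalg.norm(w - w_q), np.linalg.norm(w - (w_q + adapter)))  # residual shrinks
```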

arXiv:2407.21748 [pdf, other] cs.RO cs.LG
Diagnostic Runtime Monitoring with Martingales
Authors: Ali Hindy, Rachel Luo, Somrita Banerjee, Jonathan Kuck, Edward Schmerling, Marco Pavone
Abstract: Machine learning systems deployed in safety-critical robotics settings must be robust to distribution shifts. However, system designers must understand the cause of a distribution shift in order to implement the appropriate intervention or mitigation strategy and prevent system failure. In this paper, we present a novel framework for diagnosing distribution shifts in a streaming fashion by deploying multiple stochastic martingales simultaneously. We show that knowledge of the underlying cause of a distribution shift can lead to proper interventions over the lifecycle of a deployed system. Our experimental framework can easily be adapted to different types of distribution shifts, models, and datasets. We find that our method outperforms existing work on diagnosing distribution shifts in terms of speed, accuracy, and flexibility, and validate the efficiency of our model in both simulated and live hardware settings.
Submitted 31 July, 2024; originally announced July 2024.
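
A standard building block for this kind of monitor is the power martingale over conformal p-values. The paper deploys several monitors in parallel to diagnose the shift type; a single detector looks roughly like this, where eps and the alarm threshold are conventional choices rather than values from the paper:

```python
import numpy as np

def power_martingale(p_values, eps=0.5):
    """Classic exchangeability (power) martingale over streaming conformal p-values:
    M_n = prod_i eps * p_i**(eps - 1). With no shift, p-values are ~Uniform(0, 1)
    and M_n stays small; under shift, small p-values make M_n grow."""
    logs = np.log(eps) + (eps - 1) * np.log(np.clip(p_values, 1e-12, 1.0))
    return np.exp(np.cumsum(logs))

# By Ville's inequality, threshold 100 bounds the false-alarm probability by 1/100.
alarms = power_martingale(np.random.uniform(size=200)) > 100
```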

arXiv:2407.20906 [pdf, other] cs.CL cs.AI physics.data-an
Automated Review Generation Method Based on Large Language Models
Authors: Shican Wu, Xiao Ma, Dehui Luo, Lulu Li, Xiangcheng Shi, Xin Chang, Xiaoyun Lin, Ran Luo, Chunlei Pei, Zhi-Jian Zhao, Jinlong Gong
Abstract: Literature research, vital for scientific advancement, is overwhelmed by the vast ocean of available information. To address this, we propose an automated review generation method based on Large Language Models (LLMs) to streamline literature processing and reduce cognitive load. In a case study on propane dehydrogenation (PDH) catalysts, our method swiftly generated comprehensive reviews from 343 articles, averaging seconds per article per LLM account. Extended analysis of 1041 articles provided deep insights into catalysts' composition, structure, and performance. Recognizing LLMs' tendency to hallucinate, we employed a multi-layered quality control strategy to ensure the method's reliability and effective hallucination mitigation. Expert verification confirms the accuracy and citation integrity of the generated reviews, demonstrating that LLM hallucination risks are reduced to below 0.5% with over 95% confidence. A released Windows application enables one-click review generation, aiding researchers in tracking advancements and recommending literature. This approach showcases LLMs' role in enhancing scientific research productivity and sets the stage for further exploration.
Submitted 30 July, 2024; originally announced July 2024.
Comments: 16 pages, 3 figures, 3 tables

arXiv:2407.20730 [pdf, other] cs.CV
Autogenic Language Embedding for Coherent Point Tracking
Authors: Zikai Song, Ying Tang, Run Luo, Lintao Ma, Junqing Yu, Yi-Ping Phoebe Chen, Wei Yang
Abstract: Point tracking is a challenging task in computer vision, aiming to establish point-wise correspondence across long video sequences. Recent advancements have primarily focused on temporal modeling techniques to improve local feature similarity, often overlooking the valuable semantic consistency inherent in tracked points. In this paper, we introduce a novel approach leveraging language embeddings to enhance the coherence of frame-wise visual features related to the same object. Our proposed method, termed autogenic language embedding for visual feature enhancement, strengthens point correspondence in long-term sequences. Unlike existing visual-language schemes, our approach learns text embeddings from visual features through a dedicated mapping network, enabling seamless adaptation to various tracking tasks without explicit text annotations. Additionally, we introduce a consistency decoder that efficiently integrates text tokens into visual features with minimal computational overhead. Through enhanced visual consistency, our approach significantly improves tracking trajectories in lengthy videos with substantial appearance variations. Extensive experiments on widely used tracking benchmarks demonstrate the superior performance of our method, showcasing notable enhancements compared to trackers relying solely on visual cues.
Submitted 30 July, 2024; originally announced July 2024.
Comments: accepted by ACM MM 2024

arXiv:2407.17377 [pdf, other] cs.LG
Entropy Reweighted Conformal Classification
Authors: Rui Luo, Nicolo Colombo
Abstract: Conformal Prediction (CP) is a powerful framework for constructing prediction sets with guaranteed coverage. However, recent studies have shown that integrating confidence calibration with CP can lead to a degradation in efficiency. In this paper, we propose an adaptive approach that considers the classifier's uncertainty and employs entropy-based reweighting to enhance the efficiency of prediction sets for conformal classification. Our experimental results demonstrate that this method significantly improves efficiency.
Submitted 24 July, 2024; originally announced July 2024.
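
The abstract does not spell out the reweighting, so the following is only a plausible reading, not the paper's procedure: re-temper each example's softmax by its predictive entropy, then run standard split-conformal classification. The knob `beta` and the score 1 - p_y are assumptions.

```python
import numpy as np

def softmax(z, t=1.0):
    z = z / t
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def entropy(p):
    return -(p * np.log(np.clip(p, 1e-12, None))).sum(axis=1)

def entropy_reweighted_sets(logits_cal, y_cal, logits_test, alpha=0.1, beta=1.0):
    """Hedged sketch: flatten high-entropy (uncertain) predictions with a per-example
    temperature 1 + beta*H before ordinary split-conformal calibration."""
    def reweight(logits):
        h = entropy(softmax(logits))
        return softmax(logits, t=(1.0 + beta * h)[:, None])
    p_cal, p_test = reweight(logits_cal), reweight(logits_test)
    scores = 1.0 - p_cal[np.arange(len(y_cal)), y_cal]
    n = len(scores)
    q = np.sort(scores)[min(int(np.ceil((n + 1) * (1 - alpha))), n) - 1]
    return [np.flatnonzero(1.0 - p <= q) for p in p_test]  # labels kept per test point
```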

arXiv:2407.14495 [pdf, other] cs.LG stat.ML
Conformal Thresholded Intervals for Efficient Regression
Authors: Rui Luo, Zhixin Zhou
Abstract: This paper introduces Conformal Thresholded Intervals (CTI), a novel conformal regression method that aims to produce the smallest possible prediction set with guaranteed coverage. Unlike existing methods that rely on a nested conformal framework and full conditional distribution estimation, CTI estimates the conditional probability density for a new response to fall into each interquantile interval using off-the-shelf multi-output quantile regression. CTI constructs prediction sets by thresholding the estimated conditional interquantile intervals based on their length, which is inversely proportional to the estimated probability density. The threshold is determined using a calibration set to ensure marginal coverage. Experimental results demonstrate that CTI achieves optimal performance across various datasets.
Submitted 19 July, 2024; originally announced July 2024.

arXiv:2407.10814 [pdf, other] cs.CV
Pathology-knowledge Enhanced Multi-instance Prompt Learning for Few-shot Whole Slide Image Classification
Authors: Linhao Qu, Dingkang Yang, Dan Huang, Qinhao Guo, Rongkui Luo, Shaoting Zhang, Xiaosong Wang
Abstract: Current multi-instance learning algorithms for pathology image analysis often require a substantial number of Whole Slide Images (WSIs) for effective training but exhibit suboptimal performance in scenarios with limited learning data. In clinical settings, restricted access to pathology slides is inevitable due to patient privacy concerns and the prevalence of rare or emerging diseases. The emergence of Few-shot Weakly Supervised WSI Classification addresses the significant challenge of limited slide data and sparse slide-level labels for diagnosis. Prompt learning based on pre-trained models (e.g., CLIP) appears to be a promising scheme for this setting; however, current research in this area is limited, and existing algorithms often focus solely on patch-level prompts or confine themselves to language prompts. This paper proposes a multi-instance prompt learning framework enhanced with pathology knowledge, i.e., integrating visual and textual prior knowledge into prompts at both the patch and slide levels. The training process employs a combination of static and learnable prompts, effectively guiding the activation of pre-trained models and further facilitating the diagnosis of key pathology patterns. Lightweight Messenger (self-attention) and Summary (attention-pooling) layers are introduced to model relationships between patches and slides within the same patient data. Additionally, alignment-wise contrastive losses ensure feature-level alignment between visual and textual learnable prompts for both patches and slides. Our method demonstrates superior performance in three challenging clinical tasks, significantly outperforming comparative few-shot methods.
Submitted 15 July, 2024; originally announced July 2024.
Comments: Accepted by ECCV 2024

arXiv:2407.10230 [pdf, other] stat.ML cs.LG
Weighted Aggregation of Conformity Scores for Classification
Authors: Rui Luo, Zhixin Zhou
Abstract: Conformal prediction is a powerful framework for constructing prediction sets with valid coverage guarantees in multi-class classification. However, existing methods often rely on a single score function, which can limit their efficiency and informativeness. We propose a novel approach that combines multiple score functions to improve the performance of conformal predictors by identifying optimal weights that minimize prediction set size. Our theoretical analysis establishes a connection between the weighted score functions and subgraph classes of functions studied in Vapnik-Chervonenkis theory, providing a rigorous mathematical basis for understanding the effectiveness of the proposed method. Experiments demonstrate that our approach consistently outperforms single-score conformal predictors while maintaining valid coverage, offering a principled and data-driven way to enhance the efficiency and practicality of conformal prediction in classification tasks.
Submitted 14 July, 2024; originally announced July 2024.

arXiv:2407.04407 [pdf, other] cs.LG
Trustworthy Classification through Rank-Based Conformal Prediction Sets
Authors: Rui Luo, Zhixin Zhou
Abstract: Machine learning classification tasks often benefit from predicting a set of possible labels with confidence scores to capture uncertainty. However, existing methods struggle with the high-dimensional nature of the data and the lack of well-calibrated probabilities from modern classification models. We propose a novel conformal prediction method that employs a rank-based score function suitable for classification models that predict the order of labels correctly, even if not well-calibrated. Our approach constructs prediction sets that achieve the desired coverage rate while managing their size. We provide a theoretical analysis of the expected size of the conformal prediction sets based on the rank distribution of the underlying classifier. Through extensive experiments, we demonstrate that our method outperforms existing techniques on various datasets, providing reliable uncertainty quantification. Our contributions include a novel conformal prediction method, theoretical analysis, and empirical evaluation. This work advances the practical deployment of machine learning systems by enabling reliable uncertainty quantification.
Submitted 5 July, 2024; originally announced July 2024.
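
A rank-based conformity score is straightforward to sketch: calibrate a cutoff on the rank of the true label, then return top-k sets. The paper's exact score may differ; this shows only the basic recipe, which needs the label ordering but no calibrated probabilities.

```python
import numpy as np

def rank_conformal_sets(logits_cal, y_cal, logits_test, alpha=0.1):
    """Split-conformal classification with the rank of the true label (1 = top-1)
    as the conformity score."""
    order_cal = np.argsort(-logits_cal, axis=1)
    ranks = np.array([np.where(order_cal[i] == y_cal[i])[0][0] + 1
                      for i in range(len(y_cal))])
    n = len(ranks)
    k = int(np.ceil((n + 1) * (1 - alpha)))
    k_star = int(np.sort(ranks)[min(k, n) - 1])     # calibrated rank cutoff
    order_test = np.argsort(-logits_test, axis=1)
    return [set(row[:k_star]) for row in order_test]  # top-k_star labels per example
```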

arXiv:2406.18294 [pdf, other] cs.CL
Hierarchical Context Pruning: Optimizing Real-World Code Completion with Repository-Level Pretrained Code LLMs
Authors: Lei Zhang, Yunshui Li, Jiaming Li, Xiaobo Xia, Jiaxi Yang, Run Luo, Minzheng Wang, Longze Chen, Junhao Liu, Min Yang
Abstract: Some recently developed code large language models (Code LLMs) have been pre-trained on repository-level code data (Repo-Code LLMs), enabling these models to recognize repository structures and utilize cross-file information for code completion. However, in real-world development scenarios, simply concatenating the entire code repository often exceeds the context window limits of these Repo-Code LLMs, leading to significant performance degradation. In this study, we conducted extensive preliminary experiments and analyses on six Repo-Code LLMs. The results indicate that maintaining the topological dependencies of files and increasing the amount of code file content in the completion prompts can improve completion accuracy, while pruning the specific implementations of functions in all dependent files does not significantly reduce accuracy. Based on these findings, we propose a strategy named Hierarchical Context Pruning (HCP) to construct completion prompts with high informational code content. HCP models the code repository at the function level, maintaining the topological dependencies between code files while removing a large amount of irrelevant code content, which significantly reduces the input length for repository-level code completion. We applied the HCP strategy in experiments with six Repo-Code LLMs, and the results demonstrate that our method can significantly enhance completion accuracy while substantially reducing input length. Our code and data are available at https://github.com/Hambaobao/HCP-Coder.
Submitted 27 June, 2024; v1 submitted 26 June, 2024; originally announced June 2024.
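
The pruning finding suggests a simple transformation: keep function signatures (and docstrings) in dependency files but drop implementations. Below is a minimal Python-only sketch using the standard ast module (Python 3.9+ for ast.unparse); HCP's topological file ranking is omitted.

```python
import ast

def prune_function_bodies(source: str) -> str:
    """Replace each function's implementation with `...`, keeping the signature and
    docstring, so dependency files shrink before being packed into a prompt."""
    tree = ast.parse(source)
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            keep = [node.body[0]] if ast.get_docstring(node) else []
            node.body = keep + [ast.Expr(ast.Constant(...))]
    return ast.unparse(tree)

print(prune_function_bodies("def add(a, b):\n    '''Add two ints.'''\n    return a + b\n"))
```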

arXiv:2406.17419 [pdf, other] cs.CL cs.AI
Leave No Document Behind: Benchmarking Long-Context LLMs with Extended Multi-Doc QA
Authors: Minzheng Wang, Longze Chen, Cheng Fu, Shengyi Liao, Xinghua Zhang, Bingli Wu, Haiyang Yu, Nan Xu, Lei Zhang, Run Luo, Yunshui Li, Min Yang, Fei Huang, Yongbin Li
Abstract: Long-context modeling capabilities have garnered widespread attention, leading to the emergence of Large Language Models (LLMs) with ultra-long context windows. Meanwhile, benchmarks for evaluating long-context LLMs are gradually catching up. However, existing benchmarks employ irrelevant noise texts to artificially extend the length of test cases, diverging from real-world long-context applications. To bridge this gap, we propose Loong, a novel long-context benchmark aligned with realistic scenarios through extended multi-document question answering (QA). Unlike typical document QA, in Loong's test cases each document is relevant to the final answer, and ignoring any document leads to a failed answer. Furthermore, Loong introduces four types of tasks with a range of context lengths: Spotlight Locating, Comparison, Clustering, and Chain of Reasoning, to facilitate a more realistic and comprehensive evaluation of long-context understanding. Extensive experiments indicate that existing long-context language models still exhibit considerable potential for enhancement. Retrieval-augmented generation (RAG) achieves poor performance, demonstrating that Loong can reliably assess a model's long-context modeling capabilities.
Submitted 3 October, 2024; v1 submitted 25 June, 2024; originally announced June 2024.
Comments: EMNLP 2024 Main. We release our code and data publicly at https://github.com/MozerWang/Loong
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08281">arXiv:2406.08281</a> <span> [<a href="https://arxiv.org/pdf/2406.08281">pdf</a>, <a href="https://arxiv.org/format/2406.08281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Conformal Load Prediction with Transductive Graph Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Rui Luo</a>, <a href="/search/cs?searchtype=author&query=Colombo%2C+N">Nicolo Colombo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2406.08281v1-abstract-full" style="display: inline;"> Predicting edge weights on graphs has various applications, from transportation systems to social networks. This paper describes a Graph Neural Network (GNN) approach for edge weight prediction with guaranteed coverage. We leverage conformal prediction to calibrate the GNN outputs and produce valid prediction intervals. We handle data heteroscedasticity through error reweighting and Conformalized Quantile Regression (CQR). We compare the performance of our method against baseline techniques on real-world transportation datasets. Our approach has better coverage and efficiency than all baselines and showcases robustness and adaptability. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li>
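<p class="is-size-7">Since the abstract leans on Conformalized Quantile Regression, a generic split-CQR sketch may help. This is the standard recipe, not the paper's graph-specific pipeline; the quantile predictions <code>q_lo</code>/<code>q_hi</code> are assumed to come from any base model.</p>
<pre><code class="language-python">
import numpy as np

def cqr_interval(q_lo_cal, q_hi_cal, y_cal, q_lo_test, q_hi_test, alpha=0.1):
    # Conformity score: how far the true value falls outside [q_lo, q_hi].
    scores = np.maximum(q_lo_cal - y_cal, y_cal - q_hi_cal)
    n = len(y_cal)
    # Finite-sample-corrected (1 - alpha) quantile of calibration scores.
    q = np.quantile(scores, min(1.0, np.ceil((n + 1) * (1 - alpha)) / n))
    # Widen (or tighten, if q is negative) the test intervals by q.
    return q_lo_test - q, q_hi_test + q
</code></pre>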
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20648">arXiv:2405.20648</a> <span> [<a href="https://arxiv.org/pdf/2405.20648">pdf</a>, <a href="https://arxiv.org/format/2405.20648">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3689091.3690086">10.1145/3689091.3690086 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision Models For Video Captioning and Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Richard Luo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+A">Austin Peng</a>, <a href="/search/cs?searchtype=author&query=Vasudev%2C+A">Adithya Vasudev</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+R">Rishabh Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.20648v2-abstract-full" style="display: inline;"> Video is an increasingly prominent and information-dense medium, yet it poses substantial challenges for language models. A typical video consists of a sequence of shorter segments, or shots, that collectively form a coherent narrative. Each shot is analogous to a word in a sentence where multiple data streams of information (such as visual and auditory data) must be processed simultaneously. Comprehension of the entire video requires not only understanding the visual-audio information of each shot but also linking the ideas between shots to generate a larger, all-encompassing story. Despite significant progress in the field, current works often overlook videos' more granular shot-by-shot semantic information. In this project, we propose Shotluck Holmes, a family of efficient large language vision models (LLVMs) for video summarization and captioning. By leveraging better pretraining and data collection strategies, we extend the abilities of existing small LLVMs from understanding a single picture to understanding a sequence of frames. Specifically, we show that Shotluck Holmes achieves better performance than state-of-the-art results on the Shot2Story video captioning and summary task with significantly smaller and more computationally efficient models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17915">arXiv:2405.17915</a> <span> [<a href="https://arxiv.org/pdf/2405.17915">pdf</a>, <a href="https://arxiv.org/format/2405.17915">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Long Context is Not Long at All: A Prospector of Long-Dependency Data for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Longze Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqiang Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanwei He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.17915v1-abstract-full" style="display: inline;"> Long-context modeling capabilities are important for large language models (LLMs) in various applications. However, directly training LLMs with long context windows is insufficient to enhance this capability, since some training samples do not exhibit strong semantic dependencies across long contexts. In this study, we propose a data mining framework \textbf{ProLong} that assigns each training sample a long-dependency score, which can be used to rank and filter samples that are more advantageous for enhancing long-context modeling abilities in LLM training. Specifically, we first use delta perplexity scores to measure the \textit{Dependency Strength} between text segments in a given document. Then we refine this metric based on the \textit{Dependency Distance} of these segments to incorporate spatial relationships across long contexts. Final results are calibrated with a \textit{Dependency Specificity} metric to prevent trivial dependencies introduced by repetitive patterns. Moreover, a random sampling approach is proposed to optimize the computational efficiency of ProLong. Comprehensive experiments on multiple benchmarks indicate that ProLong effectively identifies documents that carry long dependencies, and LLMs trained on these documents exhibit significantly enhanced long-context modeling capabilities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures, ACL 2024</span> </p> </li>
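<p class="is-size-7">The Dependency Strength step above can be pictured with a minimal delta-perplexity probe: how much does prepending segment A help a causal LM predict segment B? The snippet below is an illustrative sketch, not the released ProLong code; <code>gpt2</code> is an arbitrary stand-in, and the Dependency Distance and Specificity refinements are omitted.</p>
<pre><code class="language-python">
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

@torch.no_grad()
def nll(text, context=""):
    # mean negative log-likelihood of `text`, optionally conditioned on `context`
    ids = tok(context + text, return_tensors="pt").input_ids
    n_ctx = len(tok(context).input_ids)
    logp = torch.log_softmax(lm(ids).logits[0, :-1], dim=-1)
    tgt = ids[0, 1:]
    token_nll = -logp[torch.arange(tgt.numel()), tgt]
    return token_nll[max(n_ctx - 1, 0):].mean().item()  # score only `text` tokens

def dependency_strength(seg_a, seg_b):
    # delta perplexity in log space: positive when seg_a helps predict seg_b
    base, cond = nll(seg_b), nll(seg_b, context=seg_a)
    return (base - cond) / base
</code></pre>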
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17034">arXiv:2405.17034</a> <span> [<a href="https://arxiv.org/pdf/2405.17034">pdf</a>, <a href="https://arxiv.org/format/2405.17034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FUGNN: Harmonizing Fairness and Utility in Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqiang Luo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huafei Huang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhuoyang Han</a>, <a href="/search/cs?searchtype=author&query=He%2C+E">Estrid He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiuzhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Feng Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.17034v2-abstract-full" style="display: inline;"> Fairness-aware Graph Neural Networks (GNNs) often face a challenging trade-off, where prioritizing fairness may require compromising utility. In this work, we re-examine fairness through the lens of spectral graph theory, aiming to reconcile fairness and utility within the framework of spectral graph learning. We explore the correlation between sensitive features and spectrum in GNNs, using theoretical analysis to delineate the similarity between original sensitive features and those after convolution under different spectra. Our analysis reveals a reduction in the impact of similarity when the eigenvectors associated with the largest magnitude eigenvalue exhibit directional similarity. Based on these theoretical insights, we propose FUGNN, a novel spectral graph learning approach that harmonizes the conflict between fairness and utility. FUGNN ensures algorithmic fairness and utility by truncating the spectrum and optimizing eigenvector distribution during the encoding process. The fairness-aware eigenvector selection reduces the impact of convolution on sensitive features while concurrently minimizing the sacrifice of utility. FUGNN further optimizes the distribution of eigenvectors through a transformer architecture. By incorporating the optimized spectrum into the graph convolution network, FUGNN effectively learns node representations. Experiments on six real-world datasets demonstrate the superiority of FUGNN over baseline methods. The code is available at https://github.com/yushuowiki/FUGNN. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in SIGKDD 2024</span> </p> </li>
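<p class="is-size-7">One way to picture "truncating the spectrum and optimizing eigenvector distribution": score each eigenvector by its alignment with the sensitive attribute and keep the strong, low-bias spectral modes. Every detail below (the scoring rule, the keep count, the normalization) is an invented illustration, not FUGNN's algorithm.</p>
<pre><code class="language-python">
import numpy as np

def fair_truncated_filter(A, X, s, keep=16):
    # A: symmetric adjacency matrix, X: node features, s: 0/1 sensitive attribute.
    d = A.sum(axis=1)
    A_norm = A / np.sqrt(np.outer(d, d))             # symmetric normalization
    lam, U = np.linalg.eigh(A_norm)                  # graph spectrum
    bias = np.abs(U.T @ (s - s.mean()))              # alignment with sensitive feature
    order = np.argsort(bias / (np.abs(lam) + 1e-9))  # prefer unbiased, strong modes
    idx = order[:keep]                               # truncate the spectrum
    # spectral convolution restricted to the selected eigenpairs
    return (U[:, idx] * lam[idx]) @ (U[:, idx].T @ X)
</code></pre>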
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16919">arXiv:2405.16919</a> <span> [<a href="https://arxiv.org/pdf/2405.16919">pdf</a>, <a href="https://arxiv.org/format/2405.16919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VoCoT: Unleashing Visually Grounded Multi-Step Reasoning in Large Multi-Modal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zejun Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruipu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Minghui Qiu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.16919v2-abstract-full" style="display: inline;"> While large multi-modal models (LMMs) have exhibited impressive capabilities across diverse tasks, their effectiveness in handling complex tasks has been limited by the prevailing single-step reasoning paradigm. To this end, this paper proposes VoCoT, a multi-step Visually grounded object-centric Chain-of-Thought reasoning framework tailored for inference with LMMs. VoCoT is characterized by two key features: (1) object-centric reasoning paths that revolve around cross-modal shared object-level information, and (2) visually grounded representation of object concepts in a multi-modal interleaved and aligned manner, which effectively bridges the modality gap within LMMs during long-term generation. Additionally, we construct an instruction dataset to facilitate LMMs in adapting to reasoning with VoCoT. By integrating VoCoT into the prevalent open-source LMM architecture, we develop VolCano. With only 7B parameters and limited input resolution, VolCano demonstrates excellent performance across various scenarios, surpassing SOTA models, including GPT-4V, in tasks requiring complex reasoning. Our code, data and model will be available at https://github.com/RupertLuo/VoCoT. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15232">arXiv:2405.15232</a> <span> [<a href="https://arxiv.org/pdf/2405.15232">pdf</a>, <a href="https://arxiv.org/format/2405.15232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DEEM: Diffusion Models Serve as the Eyes of Large Language Models for Image Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Longze Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanwei He</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Ting-En Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zikai Song</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+X">Xiaobo Xia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+B">Binyuan Hui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.15232v3-abstract-full" style="display: inline;"> The development of large language models (LLMs) has significantly advanced the emergence of large multimodal models (LMMs). While LMMs have achieved tremendous success by promoting the synergy between multimodal comprehension and creation, they often face challenges when confronted with out-of-distribution data, such as failing to distinguish orientation, quantity, color, and structure. This is primarily due to their reliance on image encoders trained to encode images into task-relevant features, which may lead them to disregard irrelevant details. Delving into the modeling capabilities of diffusion models for images naturally prompts the question: can diffusion models serve as the eyes of large language models for image perception? In this paper, we propose DEEM, a simple but effective approach that utilizes the generative feedback of diffusion models to align the semantic distributions of the image encoder. This addresses the drawbacks of previous methods that solely relied on image encoders like CLIP-ViT, thereby enhancing the model's resilience against out-of-distribution samples and reducing visual hallucinations. Importantly, this is achieved without requiring additional training modules and with fewer training parameters. We extensively evaluated DEEM on both our newly constructed RobustVQA benchmark and other well-known benchmarks, POPE and MMVP, for visual hallucination and perception. In particular, DEEM improves LMMs' visual perception performance to a large extent (e.g., 4% higher on RobustVQA, 6.5% higher on MMVP and 12.8% higher on POPE). Compared to the state-of-the-art interleaved content generation models, DEEM exhibits enhanced robustness and a superior capacity to alleviate model hallucinations while utilizing fewer trainable parameters, less pre-training data (10%), and a smaller base model size. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages. arXiv admin note: text overlap with arXiv:2401.10208 by other authors</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13824">arXiv:2405.13824</a> <span> [<a href="https://arxiv.org/pdf/2405.13824">pdf</a>, <a href="https://arxiv.org/format/2405.13824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GMMFormer v2: An Uncertainty-aware Framework for Partially Relevant Video Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuting Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinpeng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruisheng Luo</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-Tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.13824v1-abstract-full" style="display: inline;"> Given a text query, partially relevant video retrieval (PRVR) aims to retrieve untrimmed videos containing relevant moments. Due to the lack of moment annotations, the uncertainty lying in clip modeling and text-clip correspondence leads to major challenges. Despite the great progress, existing solutions either sacrifice efficiency or efficacy to capture varying and uncertain video moments. What's worse, few methods have paid attention to the text-clip matching pattern under such uncertainty, exposing the risk of semantic collapse. To address these issues, we present GMMFormer v2, an uncertainty-aware framework for PRVR. For clip modeling, we improve a strong baseline GMMFormer with a novel temporal consolidation module upon multi-scale contextual features, which maintains efficiency and improves the perception for varying moments. To achieve uncertainty-aware text-clip matching, we upgrade the query diverse loss in GMMFormer to facilitate fine-grained uniformity and propose a novel optimal matching loss for fine-grained text-clip alignment. Their collaboration alleviates the semantic collapse phenomenon and neatly promotes accurate correspondence between texts and moments. We conduct extensive experiments and ablation studies on three PRVR benchmarks, demonstrating remarkable improvement of GMMFormer v2 over the previous SOTA competitor and the versatility of uncertainty-aware text-clip matching for PRVR. Code is available at \url{https://github.com/huangmozhi9527/GMMFormer_v2}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09543">arXiv:2405.09543</a> <span> [<a href="https://arxiv.org/pdf/2405.09543">pdf</a>, <a href="https://arxiv.org/format/2405.09543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Algorithmic Fairness: A Tolerance Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqiang Luo</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+T">Tao Tang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Feng Xia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaying Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengpei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L+Y">Leo Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+W">Wei Xiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengqi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.09543v1-abstract-full" style="display: inline;"> Recent advancements in machine learning and deep learning have brought algorithmic fairness into sharp focus, illuminating concerns over discriminatory decision making that negatively impacts certain individuals or groups. These concerns have manifested in legal, ethical, and societal challenges, including the erosion of trust in intelligent systems. In response, this survey delves into the existing literature on algorithmic fairness, specifically highlighting its multifaceted social consequences. We introduce a novel taxonomy based on 'tolerance', a term we define as the degree to which variations in fairness outcomes are acceptable, providing a structured approach to understanding the subtleties of fairness within algorithmic decisions. Our systematic review covers diverse industries, revealing critical insights into the balance between algorithmic decision making and social equity. By synthesizing these insights, we outline a series of emerging challenges and propose strategic directions for future research and policy making, with the goal of advancing the field towards more equitable algorithmic systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T01; 68W40 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; K.4.2; H.1.2 </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.17169">arXiv:2404.17169</a> <span> [<a href="https://arxiv.org/pdf/2404.17169">pdf</a>, <a href="https://arxiv.org/format/2404.17169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> FairGT: A Fairness-aware Graph Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqiang Luo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huafei Huang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiuzhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Feng Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2404.17169v1-abstract-full" style="display: inline;"> The design of Graph Transformers (GTs) generally neglects considerations for fairness, resulting in biased outcomes against certain sensitive subgroups. Since GTs encode graph information without relying on message-passing mechanisms, conventional fairness-aware graph learning methods are not directly applicable to these issues. To tackle this challenge, we propose FairGT, a Fairness-aware Graph Transformer explicitly crafted to mitigate fairness concerns inherent in GTs. FairGT incorporates a meticulous structural feature selection strategy and a multi-hop node feature integration method, ensuring independence of sensitive features and bolstering fairness considerations. These fairness-aware graph information encodings seamlessly integrate into the Transformer framework for downstream tasks. We also prove that the proposed fair structural topology encoding with adjacency matrix eigenvector selection and multi-hop integration is theoretically effective. Empirical evaluations conducted across five real-world datasets demonstrate FairGT's superiority in fairness metrics over existing graph transformers, graph neural networks, and state-of-the-art fairness-aware graph learning approaches. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IJCAI2024 </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10514">arXiv:2404.10514</a> <span> [<a href="https://arxiv.org/pdf/2404.10514">pdf</a>, <a href="https://arxiv.org/format/2404.10514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Simple $k$-crashing Plan with a Good Approximation Ratio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruixi Luo</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+K">Kai Jin</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zelin Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2404.10514v1-abstract-full" style="display: inline;"> In project management, a project is typically described as an activity-on-edge network (AOE network), where each activity / job is represented as an edge of some network $N$ (which is a DAG). Some jobs must be finished before others can be started, as described by the topology structure of $N$. At normal speed, job $j_i$ requires $b_i$ days to be finished after it is started. Given the network $N$ with the associated edge lengths $b_1,\ldots,b_m$, the duration of the project is determined, which equals the length of the critical path (namely, the longest path) of $N$. To speed up the project (i.e. reduce the duration), the manager can crash a few jobs (namely, reduce the length of the corresponding edges) by investing extra resources into those jobs. However, the time for completing $j_i$ has a lower bound due to technological limits -- it requires at least $a_i$ days to be completed. Moreover, it is expensive to buy resources. Given $N$ and an integer $k\geq 1$, the $k$-crashing problem asks for the minimum amount of resources required to speed up the project by $k$ days. We show a simple and efficient algorithm with an approximation ratio of $\frac{1}{1}+\ldots+\frac{1}{k}$ for this problem. We also study a related problem called $k$-LIS, in which we are given a sequence $\omega$ of numbers and we aim to find $k$ disjoint increasing subsequences of $\omega$ with the largest total length. We show a $(1-\frac{1}{e})$-approximation algorithm which is simple and efficient. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> K.6.1 </p> </li>
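<p class="is-size-7">One plausible reading of a simple greedy achieving the harmonic ratio $\frac{1}{1}+\ldots+\frac{1}{k}$: speed the project up one day at a time, each time crashing a cheapest set of critical edges that meets every critical path, i.e. an s-t minimum cut of the critical subnetwork. The sketch below is hypothetical and not necessarily the authors' algorithm; it assumes a simple DAG and a per-day crashing cost stored in a <code>cost</code> edge attribute.</p>
<pre><code class="language-python">
import networkx as nx

def longest_dists(G, source):
    # longest distance from `source` to every node of a DAG
    dist = {v: float("-inf") for v in G}
    dist[source] = 0.0
    for u in nx.topological_sort(G):
        for v in G.successors(u):
            cand = dist[u] + G[u][v]["length"]
            if cand > dist[v]:
                dist[v] = cand
    return dist

def crash_one_day(G, s, t):
    # Crash a cheapest edge set hitting every critical (longest) s-t path,
    # shortening the project duration by exactly one day.
    d_from = longest_dists(G, s)
    d_to = longest_dists(G.reverse(copy=True), t)
    L = d_from[t]
    H = nx.DiGraph()
    for u, v, attr in G.edges(data=True):
        if d_from[u] + attr["length"] + d_to[v] == L:  # edge lies on a critical path
            if attr["length"] > attr["lower"]:         # still above its lower bound a_i
                H.add_edge(u, v, capacity=attr["cost"])
            else:
                H.add_edge(u, v)                       # missing capacity = infinite
    cost, (S, T) = nx.minimum_cut(H, s, t)
    for u, v in H.edges():
        if u in S and v in T:                          # cut edges: crash by one day
            G[u][v]["length"] -= 1
    return cost

def k_crashing(G, s, t, k):
    return sum(crash_one_day(G, s, t) for _ in range(k))
</code></pre>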
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09897">arXiv:2404.09897</a> <span> [<a href="https://arxiv.org/pdf/2404.09897">pdf</a>, <a href="https://arxiv.org/format/2404.09897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Progressive Knowledge Graph Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiayi Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruilin Luo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiaqi Sun</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jing Xiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2404.09897v1-abstract-full" style="display: inline;"> Knowledge Graph Completion (KGC) has emerged as a promising solution to address the issue of incompleteness within Knowledge Graphs (KGs). Traditional KGC research primarily centers on triple classification and link prediction. Nevertheless, we contend that these tasks do not align well with real-world scenarios and merely serve as surrogate benchmarks. In this paper, we investigate three crucial processes relevant to real-world construction scenarios: (a) the verification process, which arises from the necessity and limitations of human verifiers; (b) the mining process, which identifies the most promising candidates for verification; and (c) the training process, which harnesses verified data for subsequent utilization, in order to move toward more realistic challenges. By integrating these three processes, we introduce the Progressive Knowledge Graph Completion (PKGC) task, which simulates the gradual completion of KGs in real-world scenarios. Furthermore, to expedite PKGC processing, we propose two acceleration modules: Optimized Top-$k$ algorithm and Semantic Validity Filter. These modules significantly enhance the efficiency of the mining procedure. Our experiments demonstrate that performance in link prediction does not accurately reflect performance in PKGC. A more in-depth analysis reveals the key factors influencing the results and provides potential directions for future research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 10 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07584">arXiv:2404.07584</a> <span> [<a href="https://arxiv.org/pdf/2404.07584">pdf</a>, <a href="https://arxiv.org/format/2404.07584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UltraEval: A Lightweight Platform for Flexible and Comprehensive Evaluation for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+C">Chaoqun He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renjie Luo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuanqian Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hanghao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajie Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2404.07584v3-abstract-full" style="display: inline;"> Evaluation is pivotal for refining Large Language Models (LLMs), pinpointing their capabilities, and guiding enhancements. The rapid development of LLMs calls for a lightweight and easy-to-use framework for swift evaluation deployment. However, considering various implementation details, developing a comprehensive evaluation platform is never easy. Existing platforms are often complex and poorly modularized, hindering seamless incorporation into research workflows. This paper introduces UltraEval, a user-friendly evaluation framework characterized by its lightweight nature, comprehensiveness, modularity, and efficiency. We identify and reimplement three core components of model evaluation (models, data, and metrics). The resulting composability allows for the free combination of different models, tasks, prompts, benchmarks, and metrics within a unified evaluation workflow. Additionally, UltraEval supports diverse models owing to a unified HTTP service and provides sufficient inference acceleration. UltraEval is now publicly available for researchers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2024 System Demonstration Track, update</span> </p> </li>
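<p class="is-size-7">The models / data / metrics decomposition described above can be pictured with a toy composable pipeline. The class names and wiring below are invented for illustration and are not UltraEval's actual API.</p>
<pre><code class="language-python">
from dataclasses import dataclass
from typing import Callable, Iterable

@dataclass
class EvalTask:
    model: Callable[[str], str]            # prompt -> completion
    data: Iterable[tuple]                  # (prompt, reference) pairs
    metric: Callable[[str, str], float]    # (prediction, reference) -> score

    def run(self) -> float:
        # the three components only meet here, so each swaps out independently
        scores = [self.metric(self.model(p), ref) for p, ref in self.data]
        return sum(scores) / len(scores)

# Any model, dataset, and metric can be recombined freely:
exact_match = lambda pred, ref: float(pred.strip() == ref.strip())
task = EvalTask(model=lambda p: "42", data=[("6*7=?", "42")], metric=exact_match)
print(task.run())  # 1.0
</code></pre>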
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03828">arXiv:2404.03828</a> <span> [<a href="https://arxiv.org/pdf/2404.03828">pdf</a>, <a href="https://arxiv.org/format/2404.03828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Outlier-Efficient Hopfield Layers for Large Transformer-Based Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J+Y">Jerry Yao-Chieh Hu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+P">Pei-Hsuan Chang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Robin Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong-Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijian Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei-Po Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Han Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2404.03828v2-abstract-full" style="display: inline;"> We introduce an Outlier-Efficient Modern Hopfield Model (termed $\mathrm{OutEffHop}$) and use it to address the outlier inefficiency problem of training gigantic transformer-based models. Our main contribution is a novel associative memory model facilitating \textit{outlier-efficient} associative memory retrievals. Interestingly, this memory model manifests a model-based interpretation of an outlier-efficient attention mechanism (${\rm Softmax}_1$): it is an approximation of the memory retrieval process of $\mathrm{OutEffHop}$. Methodologically, this allows us to introduce novel outlier-efficient Hopfield layers as powerful alternatives to traditional attention mechanisms, with superior post-quantization performance. Theoretically, the Outlier-Efficient Modern Hopfield Model retains and improves the desirable properties of standard modern Hopfield models, including fixed point convergence and exponential storage capacity. Empirically, we demonstrate the efficacy of the proposed model across large-scale transformer-based and Hopfield-based models (including BERT, OPT, ViT, and STanHop-Net), benchmarking against state-of-the-art methods like $\mathtt{Clipped\_Softmax}$ and $\mathtt{Gated\_Attention}$. Notably, $\mathrm{OutEffHop}$ achieves an average reduction of 22+\% in average kurtosis and 26+\% in the maximum infinity norm of model outputs across four models. Code is available at \href{https://github.com/MAGICS-LAB/OutEffHop}{GitHub}; models are on \href{https://huggingface.co/collections/magicslabnu/outeffhop-6610fcede8d2cda23009a98f}{Hugging Face Hub}; future updates are on \href{https://arxiv.org/abs/2404.03828}{arXiv}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICML 2024; v2 updated to camera-ready version; Code available at https://github.com/MAGICS-LAB/OutEffHop; Models are on Hugging Face: https://huggingface.co/collections/magicslabnu/outeffhop-6610fcede8d2cda23009a98f</span> </p> </li>
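<p class="is-size-7">${\rm Softmax}_1$, as commonly defined, simply adds one to the softmax denominator so an attention head can assign near-zero total weight; a numerically stable NumPy sketch follows. This illustrates only the attention variant the abstract names; OutEffHop itself is the associative-memory model built around it, not this one-liner.</p>
<pre><code class="language-python">
import numpy as np

def softmax_1(x, axis=-1):
    # Softmax_1(x)_i = exp(x_i) / (1 + sum_j exp(x_j)); the extra 1 in the
    # denominator lets a head put near-zero total weight on all positions.
    m = np.maximum(np.max(x, axis=axis, keepdims=True), 0.0)  # stability shift
    e = np.exp(x - m)
    return e / (np.exp(-m) + e.sum(axis=axis, keepdims=True))

print(softmax_1(np.array([-8.0, -9.0])))  # both weights near zero, unlike softmax
</code></pre>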
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICML 2024; v2 updated to camera-ready version; Code available at https://github.com/MAGICS-LAB/OutEffHop; Models are on Hugging Face: https://huggingface.co/collections/magicslabnu/outeffhop-6610fcede8d2cda23009a98f</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01994">arXiv:2404.01994</a> <span> [<a href="https://arxiv.org/pdf/2404.01994">pdf</a>, <a href="https://arxiv.org/format/2404.01994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DELAN: Dual-Level Alignment for Vision-and-Language Navigation by Cross-Modal Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+M">Mengfei Du</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Binhao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhihao Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zejun Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruipu Luo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01994v1-abstract-short" style="display: inline;"> Vision-and-Language navigation (VLN) requires an agent to navigate in unseen environment by following natural language instruction. For task completion, the agent needs to align and integrate various navigation modalities, including instruction, observation and navigation history. Existing works primarily concentrate on cross-modal attention at the fusion stage to achieve this objective. Neverthel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01994v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01994v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01994v1-abstract-full" style="display: none;"> Vision-and-Language navigation (VLN) requires an agent to navigate in unseen environment by following natural language instruction. For task completion, the agent needs to align and integrate various navigation modalities, including instruction, observation and navigation history. Existing works primarily concentrate on cross-modal attention at the fusion stage to achieve this objective. Nevertheless, modality features generated by disparate uni-encoders reside in their own spaces, leading to a decline in the quality of cross-modal fusion and decision. To address this problem, we propose a Dual-levEL AligNment (DELAN) framework by cross-modal contrastive learning. 
arXiv:2403.13177 [pdf, other] cs.RO

User-customizable Shared Control for Robot Teleoperation via Virtual Reality

Authors: Rui Luo, Mark Zolotas, Drake Moore, Taskin Padir

Abstract: Shared control can ease and enhance a human operator's ability to teleoperate robots, particularly for intricate tasks demanding fine control over multiple degrees of freedom. However, the arbitration process dictating how much autonomous assistance to administer in shared control can confuse novice operators and impede their understanding of the robot's behavior. To overcome these adverse side effects, we propose a novel formulation of shared control that enables operators to tailor the arbitration to their unique capabilities and preferences. Unlike prior approaches to customizable shared control, where users could indirectly modify the latent parameters of the arbitration function by issuing a feedback command, we instead make these parameters observable and directly editable via a virtual reality (VR) interface. We present our user-customizable shared control method for a teleoperation task in SE(3), known as the buzz wire game. A user study is conducted with participants teleoperating a robotic arm in VR to complete the game. The experiment spanned two weeks per subject to investigate longitudinal trends. Our findings reveal that users allowed to interactively tune the arbitration parameters across trials generalize well to adaptations in the task, exhibiting improvements in precision and fluency over direct teleoperation and conventional shared control.

Submitted 14 August, 2024; v1 submitted 19 March, 2024; originally announced March 2024.

Comments: Accepted at IROS 2024
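[Editor's note] Shared-control arbitration is often written as a convex blend of the human and autonomy commands. The toy sketch below makes the abstract's idea concrete: the blending parameter is an explicit, user-editable value rather than a hidden latent quantity. The linear form and single scalar alpha are illustrative assumptions, not the paper's formulation.

```python
import numpy as np

def arbitrate(u_human: np.ndarray, u_robot: np.ndarray, alpha: float) -> np.ndarray:
    """Blend operator and autonomy commands (e.g., 6-DoF twists for SE(3)).

    alpha in [0, 1] is the arbitration level: 0 = pure teleoperation,
    1 = full autonomy. Exposing alpha directly in a VR panel, instead of
    inferring it from user feedback, is the customization idea above.
    """
    alpha = float(np.clip(alpha, 0.0, 1.0))
    return (1.0 - alpha) * u_human + alpha * u_robot
```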
arXiv:2402.14809 [pdf, other] cs.CL cs.AI cs.LG

CriticBench: Benchmarking LLMs for Critique-Correct Reasoning

Authors: Zicheng Lin, Zhibin Gou, Tian Liang, Ruilin Luo, Haowei Liu, Yujiu Yang

Abstract: The ability of Large Language Models (LLMs) to critique and refine their reasoning is crucial for their application in evaluation, feedback provision, and self-improvement. This paper introduces CriticBench, a comprehensive benchmark designed to assess LLMs' abilities to critique and rectify their reasoning across a variety of tasks. CriticBench encompasses five reasoning domains: mathematical, commonsense, symbolic, coding, and algorithmic. It compiles 15 datasets and incorporates responses from three LLM families. Utilizing CriticBench, we evaluate and dissect the performance of 17 LLMs in generation, critique, and correction reasoning, i.e., GQC reasoning. Our findings reveal: (1) a linear relationship in GQC capabilities, with critique-focused training markedly enhancing performance; (2) a task-dependent variation in correction effectiveness, with logic-oriented tasks being more amenable to correction; (3) GQC knowledge inconsistencies that decrease as model size increases; and (4) an intriguing inter-model critiquing dynamic, where stronger models are better at critiquing weaker ones, while weaker models can surprisingly surpass stronger ones in their self-critique. We hope these insights into the nuanced critique-correct reasoning of LLMs will foster further research in LLM critique and self-improvement.

Submitted 1 June, 2024; v1 submitted 22 February, 2024; originally announced February 2024.

Comments: ACL 2024 Findings
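[Editor's note] The generation/critique/correction pipeline evaluated here can be pictured as three chained model calls. The sketch below is a hypothetical harness for that protocol; the prompt wording and the `llm` callable are placeholders, not CriticBench's actual evaluation code.

```python
def gqc(llm, question: str):
    """Generation -> critique -> correction as three LLM calls.

    `llm` is any callable mapping a prompt string to a text response;
    the prompt templates are illustrative only.
    """
    answer = llm(f"Solve the following problem.\n{question}")
    critique = llm(f"Problem: {question}\nProposed answer: {answer}\n"
                   "Identify any errors in the reasoning.")
    corrected = llm(f"Problem: {question}\nAnswer: {answer}\n"
                    f"Critique: {critique}\nGive a corrected final answer.")
    return answer, critique, corrected
```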
arXiv:2402.14008 [pdf, other] cs.CL

OlympiadBench: A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems

Authors: Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, Maosong Sun

Abstract: Recent advancements have seen Large Language Models (LLMs) and Large Multimodal Models (LMMs) surpassing general human capabilities in various tasks, approaching the proficiency level of human experts across multiple domains. With traditional benchmarks becoming less challenging for these models, new rigorous challenges are essential to gauge their advanced abilities. In this work, we present OlympiadBench, an Olympiad-level bilingual multimodal scientific benchmark, featuring 8,476 problems from Olympiad-level mathematics and physics competitions, including the Chinese college entrance exam. Each problem is detailed with expert-level annotations for step-by-step reasoning. Evaluating top-tier models on OlympiadBench, we implement a comprehensive assessment methodology to accurately evaluate model responses. Notably, the best-performing model, GPT-4V, attains an average score of 17.97% on OlympiadBench, with a mere 10.74% in physics, highlighting the benchmark's rigor and the intricacy of physical reasoning. Our analysis of GPT-4V points out prevalent issues with hallucinations, knowledge omissions, and logical fallacies. We hope that our challenging benchmark can serve as a valuable resource for future AGI research endeavors. The data and evaluation code are available at https://github.com/OpenBMB/OlympiadBench

Submitted 6 June, 2024; v1 submitted 21 February, 2024; originally announced February 2024.

Comments: Accepted by ACL 2024 (main), update
arXiv:2402.07376 [pdf, other] cs.CV

Unsupervised Discovery of Object-Centric Neural Fields

Authors: Rundong Luo, Hong-Xing Yu, Jiajun Wu

Abstract: We study inferring 3D object-centric scene representations from a single image. While recent methods have shown potential in unsupervised 3D object discovery from simple synthetic images, they fail to generalize to real-world scenes with visually rich and diverse objects. This limitation stems from their object representations, which entangle objects' intrinsic attributes like shape and appearance with extrinsic, viewer-centric properties such as their 3D location. To address this bottleneck, we propose Unsupervised discovery of Object-Centric neural Fields (uOCF). uOCF focuses on learning the intrinsics of objects and models the extrinsics separately. Our approach significantly improves systematic generalization, thus enabling unsupervised learning of high-fidelity object-centric scene representations from sparse real-world images. To evaluate our approach, we collect three new datasets, including two real kitchen environments. Extensive experiments show that uOCF enables unsupervised discovery of visually rich objects from a single real image, allowing applications such as 3D object segmentation and scene manipulation. Notably, uOCF demonstrates zero-shot generalization to unseen objects from a single real image. Project page: https://red-fairy.github.io/uOCF/

Submitted 11 February, 2024; originally announced February 2024.
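[Editor's note] The intrinsics/extrinsics split described above can be sketched as one shared object field queried in object-centric coordinates, with per-object positions handled outside the field. The density-only composition and all names below are illustrative assumptions, not uOCF's architecture.

```python
import torch

def compose_density(field, latents, positions, x):
    """Evaluate a scene as a sum of object fields.

    field:     callable (points, z) -> per-point density; one shared network
               captures intrinsics (shape/appearance) via the code z
    latents:   (K, Dz) per-object intrinsic codes
    positions: (K, 3) per-object extrinsic locations
    x:         (N, 3) query points in world coordinates
    """
    sigma = torch.zeros(x.shape[0])
    for z, p in zip(latents, positions):
        sigma = sigma + field(x - p, z)   # query each object in its own frame
    return sigma
```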
arXiv:2401.06072 [pdf, other] cs.AI cs.CL

Chain of History: Learning and Forecasting with LLMs for Temporal Knowledge Graph Completion

Authors: Ruilin Luo, Tianle Gu, Haoling Li, Junzhe Li, Zicheng Lin, Jiayi Li, Yujiu Yang

Abstract: Temporal Knowledge Graph Completion (TKGC) is a complex task involving the prediction of missing event links at future timestamps by leveraging established temporal structural knowledge. This paper aims to provide a comprehensive perspective on harnessing the advantages of Large Language Models (LLMs) for reasoning in temporal knowledge graphs, presenting an easily transferable pipeline. In terms of graph modality, we underscore the LLMs' prowess in discerning the structural information of pivotal nodes within the historical chain. As for the generation mode of the LLMs utilized for inference, we conduct an exhaustive exploration into the variances induced by a range of inherent factors in LLMs, with particular attention to the challenges in comprehending reverse logic. We adopt a parameter-efficient fine-tuning strategy to harmonize the LLMs with the task requirements, facilitating the learning of the key knowledge highlighted earlier. Comprehensive experiments are undertaken on several widely recognized datasets, revealing that our framework exceeds or parallels existing methods across numerous popular metrics. Additionally, we execute a substantial range of ablation experiments and draw comparisons with several advanced commercial LLMs, to investigate the crucial factors influencing LLMs' performance in structured temporal knowledge inference tasks.

Submitted 14 February, 2024; v1 submitted 11 January, 2024; originally announced January 2024.

Comments: 15 pages; typos corrected, references added
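[Editor's note] A TKGC query is a quadruple (subject, relation, ?, timestamp) to be completed from a historical chain of facts. The sketch below shows one hypothetical way to serialize that chain into an LLM prompt; the template is an illustration of the kind of pipeline the abstract describes, not the paper's exact format.

```python
def tkgc_prompt(history, query):
    """Serialize (subject, relation, object, time) facts plus a query.

    history: iterable of 4-tuples, e.g. ("Germany", "host", "World Cup", 2006)
    query:   (subject, relation, time) whose object is to be predicted
    """
    lines = [f"{t}: ({s}, {r}, {o})" for s, r, o, t in history]
    s, r, t = query
    lines.append(f"{t}: ({s}, {r}, ?)")
    return ("Given the historical facts below, predict the missing entity.\n"
            + "\n".join(lines))
```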
arXiv:2312.09542 [pdf, other] cs.CL

Marathon: A Race Through the Realm of Long Context with Large Language Models

Authors: Lei Zhang, Yunshui Li, Ziqiang Liu, Jiaxi Yang, Junhao Liu, Longze Chen, Run Luo, Min Yang

Abstract: With the advancement of large language models (LLMs) and the expansion of their context windows, existing long-context benchmarks fall short in effectively evaluating the models' comprehension and reasoning abilities in extended texts. Moreover, conventional benchmarks relying on F1 metrics often inaccurately score responses: they may undervalue correct answers that differ from the reference responses and overvalue incorrect ones that resemble the reference texts. In response to these limitations, we introduce Marathon, a novel evaluation benchmark that adopts a multiple-choice question format. It is specifically designed to overcome the constraints of previous benchmarks and provide a rapid, precise, and unbiased appraisal of the long-context comprehension skills of large language models. We conducted comprehensive evaluations on the Marathon benchmark with a range of state-of-the-art LLMs and assessed the effectiveness of various optimization strategies tailored for long-context generation. We anticipate that the Marathon benchmark and its associated leaderboard will enable a more precise and equitable evaluation of LLMs' capabilities in understanding and reasoning over extended contexts. Marathon is available at https://github.com/Hambaobao/Marathon.

Submitted 26 June, 2024; v1 submitted 15 December, 2023; originally announced December 2023.
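[Editor's note] The abstract's complaint about F1 scoring can be reproduced in a few lines: token-overlap F1 rewards a wrong answer that shares words with the reference over a correct answer phrased differently, which is exactly what a multiple-choice format sidesteps. The scorer below is the standard token-level F1 used by many QA benchmarks, shown here purely for illustration.

```python
from collections import Counter

def token_f1(pred: str, ref: str) -> float:
    """Token-overlap F1 between a predicted and a reference answer."""
    p, r = pred.lower().split(), ref.lower().split()
    overlap = sum((Counter(p) & Counter(r)).values())
    if overlap == 0:
        return 0.0
    prec, rec = overlap / len(p), overlap / len(r)
    return 2 * prec * rec / (prec + rec)

ref = "the treaty was signed in 1648"
print(token_f1("the treaty was signed in 1649", ref))  # wrong year, F1 ~ 0.83
print(token_f1("1648", ref))                           # correct, F1 ~ 0.29
```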
arXiv:2311.16452 [pdf, other] cs.CL

Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine

Authors: Harsha Nori, Yin Tat Lee, Sheng Zhang, Dean Carignan, Richard Edgar, Nicolo Fusi, Nicholas King, Jonathan Larson, Yuanzhi Li, Weishung Liu, Renqian Luo, Scott Mayer McKinney, Robert Osazuwa Ness, Hoifung Poon, Tao Qin, Naoto Usuyama, Chris White, Eric Horvitz

Abstract: Generalist foundation models such as GPT-4 have displayed surprising capabilities in a wide variety of domains and tasks. Yet, there is a prevalent assumption that they cannot match specialist capabilities of fine-tuned models. For example, most explorations to date on medical competency benchmarks have leveraged domain-specific training, as exemplified by efforts on BioGPT and Med-PaLM. We build on a prior study of GPT-4's capabilities on medical challenge benchmarks in the absence of special training. Rather than using simple prompting to highlight the model's out-of-the-box capabilities, we perform a systematic exploration of prompt engineering. We find that prompting innovation can unlock deeper specialist capabilities and show that GPT-4 easily tops prior leading results for medical benchmarks. The prompting methods we explore are general purpose, and make no specific use of domain expertise, removing the need for expert-curated content. Our experimental design carefully controls for overfitting during the prompt engineering process. We introduce Medprompt, based on a composition of several prompting strategies. With Medprompt, GPT-4 achieves state-of-the-art results on all nine of the benchmark datasets in the MultiMedQA suite. The method outperforms leading specialist models such as Med-PaLM 2 by a significant margin with an order of magnitude fewer calls to the model. Steering GPT-4 with Medprompt achieves a 27% reduction in error rate on the MedQA dataset over the best methods to date achieved with specialist models and surpasses a score of 90% for the first time. Beyond medical problems, we show the power of Medprompt to generalize to other domains and provide evidence for the broad applicability of the approach via studies of the strategy on exams in electrical engineering, machine learning, philosophy, accounting, law, nursing, and clinical psychology.

Submitted 27 November, 2023; originally announced November 2023.

Comments: 21 pages, 7 figures

ACM Class: I.2.7
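[Editor's note] One prompting strategy commonly composed into Medprompt-style pipelines is ensembling with shuffled answer choices, which counters position bias and reduces variance through majority voting. The sketch below shows that generic technique; the `ask` callable and all mechanics are assumptions for illustration, not the paper's exact procedure.

```python
import random
from collections import Counter

def shuffled_vote(ask, question, choices, k=5, seed=0):
    """Query the model k times with answer options shuffled, then vote.

    `ask(question, choices) -> chosen option text` is a hypothetical
    model call. Shuffling removes any advantage tied to option order;
    majority voting aggregates the k runs.
    """
    rng = random.Random(seed)
    votes = []
    for _ in range(k):
        order = choices[:]
        rng.shuffle(order)
        votes.append(ask(question, order))
    return Counter(votes).most_common(1)[0][0]
```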
arXiv:2309.16975 [pdf] cs.CV eess.IV

DOI: 10.1109/ACCESS.2023.3320809

Perceptual Tone Mapping Model for High Dynamic Range Imaging

Authors: Imran Mehmood, Xinye Shi, M. Usman Khan, Ming Ronnier Luo

Abstract: One of the key challenges in tone mapping is to preserve the perceptual quality of high dynamic range (HDR) images when mapping them to standard dynamic range (SDR) displays. Traditional tone mapping operators (TMOs) compress the luminance of HDR images without considering the surround and display conditions, leading to suboptimal results. Current research addresses this challenge by incorporating perceptual color appearance attributes. In this work, we propose a TMO (TMOz) that leverages the CIECAM16 perceptual attributes, i.e., brightness, colorfulness, and hue. TMOz accounts for the effects of both the surround and the display conditions to achieve more optimal colorfulness reproduction. The perceptual brightness is compressed, and the perceptual color scales, i.e., colorfulness and hue, are derived from HDR images by employing the CIECAM16 color adaptation equations. A psychophysical experiment was conducted to automate the brightness compression parameter. The model employs a fully automatic and adaptive approach, obviating the requirement for manual parameter selection. TMOz was evaluated in terms of contrast, colorfulness and overall image quality. The objective and subjective evaluation methods revealed that the proposed model outperformed the state-of-the-art TMOs.

Submitted 29 September, 2023; originally announced September 2023.
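[Editor's note] Brightness compression in perceptual tone mapping is often a power-law squeeze of a perceptual brightness channel. The sketch below is a generic illustration of that idea only: the fixed exponent stands in for the compression parameter that TMOz derives psychophysically, and the function is not the paper's model.

```python
import numpy as np

def compress_brightness(Q: np.ndarray, gamma: float = 0.7) -> np.ndarray:
    """Power-law compression of a perceptual brightness channel Q.

    gamma < 1 pulls highlights toward the SDR range while the color
    scales (colorfulness, hue) are handled separately. TMOz automates
    this parameter; gamma here is a hand-picked stand-in.
    """
    Qn = Q / Q.max()          # normalize brightness to [0, 1]
    return Qn ** gamma        # compressed brightness channel
```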
arXiv:2309.13834 [pdf, other] cs.AI

Prior Bilinear Based Models for Knowledge Graph Completion

Authors: Jiayi Li, Ruilin Luo, Jiaqi Sun, Jing Xiao, Yujiu Yang

Abstract: Bilinear based models are powerful and widely used approaches for Knowledge Graph Completion (KGC). Although bilinear based models have achieved significant advances, these studies mainly concentrate on posterior properties (based on evidence, e.g. symmetry pattern) while neglecting the prior properties. In this paper, we find a prior property named "the law of identity" that cannot be captured by bilinear based models, which hinders them from comprehensively modeling the characteristics of KGs. To address this issue, we introduce a solution called the Unit Ball Bilinear Model (UniBi). This model not only achieves theoretical superiority but also offers enhanced interpretability and performance by minimizing ineffective learning through minimal constraints. Experiments demonstrate that UniBi models the prior property and verify its interpretability and performance.

Submitted 24 September, 2023; originally announced September 2023.
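[Editor's note] Bilinear KGC models score a triple (h, r, t) as a bilinear form over entity embeddings, score = h^T R_r t. A unit-ball constraint can be sketched as rescaling embeddings so their norm never exceeds one; the projection rule below is a generic reading of "minimal constraints", not UniBi's exact construction.

```python
import torch

def bilinear_score(h: torch.Tensor, R: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    """score(h, r, t) = h^T R t with h, t: (d,) and R: (d, d)."""
    return h @ R @ t

def project_unit_ball(e: torch.Tensor) -> torch.Tensor:
    """Rescale an embedding into the unit ball (||e|| <= 1)."""
    n = e.norm()
    return e / n if n > 1.0 else e
```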
<div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>