Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 94 results for author: <span class="mathjax">Fei, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Fei%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Fei, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Fei%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Fei, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Fei%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Fei%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Fei%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12584">arXiv:2411.12584</a> <span> [<a href="https://arxiv.org/pdf/2411.12584">pdf</a>, <a href="https://arxiv.org/format/2411.12584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Leveraging MLLM Embeddings and Attribute Smoothing for Compositional Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xudong Yan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Songhe Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yueguan Lin</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Haojun Fei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12584v1-abstract-short" style="display: inline;"> Compositional zero-shot learning (CZSL) aims to recognize novel compositions of attributes and objects learned from seen compositions. Previous works disentangle attribute and object by extracting shared and exclusive parts between image pairs sharing the same attribute (object), as well as aligning them with pretrained word embeddings to improve unseen attribute-object recognition. Despite the si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12584v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12584v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12584v1-abstract-full" style="display: none;"> Compositional zero-shot learning (CZSL) aims to recognize novel compositions of attributes and objects learned from seen compositions. 
   Previous works disentangle attribute and object by extracting shared and exclusive parts between image pairs sharing the same attribute (object), and align them with pretrained word embeddings to improve unseen attribute-object recognition. Despite the significant achievements of existing efforts, they are hampered by three limitations: (1) the efficacy of disentanglement is compromised by the influence of the background and the intricate entanglement of attribute with object in the same parts; (2) existing word embeddings fail to capture complex multimodal semantic information; and (3) the overconfidence exhibited by existing models on seen compositions hinders their generalization to novel compositions. Aware of these limitations, we propose a novel framework named Multimodal Large Language Model (MLLM) embeddings and attribute smoothing guided disentanglement (TRIDENT) for CZSL. First, we leverage feature adaptive aggregation modules to mitigate the impact of background, and utilize learnable condition masks to capture multigranularity features for disentanglement. Then, the last hidden states of an MLLM are employed as word embeddings for their superior representation capabilities. Moreover, we propose attribute smoothing with auxiliary attributes generated by a Large Language Model (LLM) for seen compositions, addressing the issue of overconfidence by encouraging the model to learn more attributes for a given composition. Extensive experiments demonstrate that TRIDENT achieves state-of-the-art performance on three benchmarks.
   Submitted 18 November, 2024; originally announced November 2024.
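Illustrative note: the attribute smoothing described in the abstract above is, in spirit, label smoothing over an attribute vocabulary, with the smoothed mass placed on LLM-suggested auxiliary attributes rather than spread uniformly. The following minimal Python sketch shows that idea; the vocabulary size, auxiliary indices, and smoothing weight are hypothetical, and this is not the TRIDENT implementation.

```python
# Minimal sketch of attribute smoothing with auxiliary attributes
# (hypothetical names and weights; not the TRIDENT code).
import torch
import torch.nn.functional as F

def smoothed_attribute_targets(gold_idx, auxiliary_idx, num_attributes, aux_mass=0.2):
    """Soft target: 1 - aux_mass on the annotated attribute, aux_mass spread
    evenly over the LLM-suggested auxiliary attributes."""
    target = torch.zeros(num_attributes)
    target[gold_idx] = 1.0 - aux_mass
    if auxiliary_idx:
        target[torch.tensor(auxiliary_idx)] += aux_mass / len(auxiliary_idx)
    return target

# Example: 5 attributes, gold attribute 2, auxiliary attributes {0, 4}.
logits = torch.randn(5)                                   # attribute scores from the model
target = smoothed_attribute_targets(2, [0, 4], num_attributes=5)
loss = F.kl_div(F.log_softmax(logits, dim=-1), target, reduction="sum")
print(float(loss))
```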
2. arXiv:2411.02265 [pdf, other]  cs.CL, cs.AI
   Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent
   Authors: Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobing Xie, Jiaqi Zhu, Kai Zhang, Shuaipeng Li, Zhen Yang, Jonny Han, Xiaobo Shu, Jiahao Bu, Zhongzhi Chen, Xuemeng Huang, Fengzong Lian, Saiyong Yang, Jianfeng Yan, Yuyuan Zeng, Xiaoqin Ren, Chao Yu, Lulu Wu, Yue Mao, Jun Xia, Tao Yang, Suncong Zheng, Kan Wu, et al. (83 additional authors not shown)
   Abstract: In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture-of-experts model, with a total of 389 billion parameters and 52 billion activated parameters, capable of handling up to 256K tokens.
   We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture-of-experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Code: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large
   Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
   Comments: 17 pages, 4 Figures
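Illustrative note: the abstract above mentions a mixture-of-experts design with an expert routing strategy. As a generic picture of how a sparse MoE layer activates only a fraction of its parameters per token, here is a top-k routing sketch in Python/PyTorch; the layer sizes, gating, and dispatch loop are assumptions for illustration and do not reflect Hunyuan-Large's actual architecture.

```python
# Generic sparse mixture-of-experts layer with top-k routing (illustration only;
# sizes and routing details are assumptions, not Hunyuan-Large's architecture).
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoE(nn.Module):
    def __init__(self, d_model=64, d_ff=256, num_experts=8, k=2):
        super().__init__()
        self.k = k
        self.router = nn.Linear(d_model, num_experts)        # gating network
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        )

    def forward(self, x):                                    # x: (tokens, d_model)
        gate_logits = self.router(x)
        weights, expert_idx = gate_logits.topk(self.k, dim=-1)
        weights = F.softmax(weights, dim=-1)                 # renormalize over the chosen experts
        out = torch.zeros_like(x)
        for slot in range(self.k):                           # route each token to its k experts
            for e, expert in enumerate(self.experts):
                mask = expert_idx[:, slot] == e
                if mask.any():
                    out[mask] += weights[mask, slot:slot + 1] * expert(x[mask])
        return out

tokens = torch.randn(10, 64)
print(TopKMoE()(tokens).shape)                               # torch.Size([10, 64])
```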
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00304">arXiv:2411.00304</a> <span> [<a href="https://arxiv.org/pdf/2411.00304">pdf</a>, <a href="https://arxiv.org/format/2411.00304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Unified Generative and Discriminative Training for Multi-modal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chow%2C+W">Wei Chow</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qifan Yu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+K">Kaihang Pan</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zhiqi Ge</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuai Yang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qianru Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00304v1-abstract-short" style="display: inline;"> In recent times, Vision-Language Models (VLMs) have been trained under two predominant paradigms. Generative training has enabled Multimodal Large Language Models (MLLMs) to tackle various complex tasks, yet issues such as hallucinations and weak object discrimination persist. Discriminative training, exemplified by models like CLIP, excels in zero-shot image-text classification and retrieval, yet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00304v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00304v1-abstract-full" style="display: none;"> In recent times, Vision-Language Models (VLMs) have been trained under two predominant paradigms. Generative training has enabled Multimodal Large Language Models (MLLMs) to tackle various complex tasks, yet issues such as hallucinations and weak object discrimination persist. Discriminative training, exemplified by models like CLIP, excels in zero-shot image-text classification and retrieval, yet struggles with complex scenarios requiring fine-grained semantic differentiation. This paper addresses these challenges by proposing a unified approach that integrates the strengths of both paradigms. Considering interleaved image-text sequences as the general format of input samples, we introduce a structure-induced training strategy that imposes semantic relationships between input samples and the MLLM's hidden state. This approach enhances the MLLM's ability to capture global semantics and distinguish fine-grained semantics. 
4. arXiv:2410.20482 [pdf, other]  cs.CL, cs.AI, cs.CV
   What Factors Affect Multi-Modal In-Context Learning? An In-Depth Exploration
   Authors: Libo Qin, Qiguang Chen, Hao Fei, Zhi Chen, Min Li, Wanxiang Che
   Abstract: Recently, Multi-Modal In-Context Learning (MM-ICL) has advanced rapidly and achieved notable success, delivering superior performance across various tasks without requiring additional parameter tuning. However, the underlying rules governing the effectiveness of MM-ICL remain under-explored.
   To fill this gap, this work investigates the research question: "What factors affect the performance of MM-ICL?" To this end, we conduct extensive experiments on the three core steps of MM-ICL, namely demonstration retrieval, demonstration ordering, and prompt construction, using 6 vision large language models and 20 strategies. Our findings highlight (1) the necessity of a multi-modal retriever for demonstration retrieval, (2) the importance of intra-demonstration ordering over inter-demonstration ordering, and (3) the enhancement of task comprehension through introductory instructions in prompts. We hope this study can serve as a foundational guide for optimizing MM-ICL strategies in future research.
   Submitted 27 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15312">arXiv:2410.15312</a> <span> [<a href="https://arxiv.org/pdf/2410.15312">pdf</a>, <a href="https://arxiv.org/format/2410.15312">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Synergistic Dual Spatial-aware Generation of Image-to-Text and Text-to-Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+L">Libo Qin</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jiayi Ji</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hongyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jianguo Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15312v1-abstract-short" style="display: inline;"> In the visual spatial understanding (VSU) area, spatial image-to-text (SI2T) and spatial text-to-image (ST2I) are two fundamental tasks that appear in dual form. Existing methods for standalone SI2T or ST2I perform imperfectly in spatial understanding, due to the difficulty of 3D-wise spatial feature modeling. In this work, we consider modeling the SI2T and ST2I together under a dual learning fram… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15312v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15312v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15312v1-abstract-full" style="display: none;"> In the visual spatial understanding (VSU) area, spatial image-to-text (SI2T) and spatial text-to-image (ST2I) are two fundamental tasks that appear in dual form. Existing methods for standalone SI2T or ST2I perform imperfectly in spatial understanding, due to the difficulty of 3D-wise spatial feature modeling. In this work, we consider modeling the SI2T and ST2I together under a dual learning framework. During the dual framework, we then propose to represent the 3D spatial scene features with a novel 3D scene graph (3DSG) representation that can be shared and beneficial to both tasks. Further, inspired by the intuition that the easier 3D$\to$image and 3D$\to$text processes also exist symmetrically in the ST2I and SI2T, respectively, we propose the Spatial Dual Discrete Diffusion (SD$^3$) framework, which utilizes the intermediate features of the 3D$\to$X processes to guide the hard X$\to$3D processes, such that the overall ST2I and SI2T will benefit each other. 
   On the visual spatial understanding dataset VSD, our system outperforms the mainstream T2I and I2T methods significantly. Further in-depth analysis reveals how our dual learning strategy advances performance.
   Submitted 20 October, 2024; originally announced October 2024.
6. arXiv:2410.15019 [pdf, other]  cs.CL
   A Survey of Ontology Expansion for Conversational Understanding
   Authors: Jinggui Liang, Yuxia Wu, Yuan Fang, Hao Fei, Lizi Liao
   Abstract: In the rapidly evolving field of conversational AI, Ontology Expansion (OnExp) is crucial for enhancing the adaptability and robustness of conversational agents. Traditional models rely on static, predefined ontologies, limiting their ability to handle new and unforeseen user needs. This survey paper provides a comprehensive review of the state-of-the-art techniques in OnExp for conversational understanding. It categorizes the existing literature into three main areas: (1) New Intent Discovery, (2) New Slot-Value Discovery, and (3) Joint OnExp. By examining the methodologies, benchmarks, and challenges associated with these areas, we highlight several emerging frontiers in OnExp that improve agent performance in real-world scenarios, and discuss their corresponding challenges. This survey aspires to be a foundational reference for researchers and practitioners, promoting further exploration and innovation in this crucial domain.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15019v1-abstract-full').style.display = 'none'; document.getElementById('2410.15019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024, code and data are available at this https URL: https://github.com/liangjinggui/Ontology-Expansion</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05767">arXiv:2410.05767</a> <span> [<a href="https://arxiv.org/pdf/2410.05767">pdf</a>, <a href="https://arxiv.org/format/2410.05767">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Grounding is All You Need? Dual Temporal Grounding for Video Dialog </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">You Qin</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xinze Lan</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xun Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&query=Zimmermann%2C+R">Roger Zimmermann</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+L">Lizi Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05767v2-abstract-short" style="display: inline;"> In the realm of video dialog response generation, the understanding of video content and the temporal nuances of conversation history are paramount. While a segment of current research leans heavily on large-scale pretrained visual-language models and often overlooks temporal dynamics, another delves deep into spatial-temporal relationships within videos but demands intricate object trajectory pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05767v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05767v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05767v2-abstract-full" style="display: none;"> In the realm of video dialog response generation, the understanding of video content and the temporal nuances of conversation history are paramount. 
8. arXiv:2410.03739 [pdf, other]  cs.CL, cs.AI
   Grammar Induction from Visual, Speech and Text
   Authors: Yu Zhao, Hao Fei, Shengqiong Wu, Meishan Zhang, Min Zhang, Tat-seng Chua
   Abstract: Grammar induction could benefit from rich heterogeneous signals, such as text, vision, and acoustics. In the process, features from distinct modalities essentially serve complementary roles to each other.
   With such intuition, this work introduces a novel unsupervised visual-audio-text grammar induction task (named VAT-GI) to induce constituent grammar trees from parallel image, text, and speech inputs. Inspired by the fact that language grammar natively exists beyond text, we argue that text need not be the predominant modality in grammar induction. Thus we further introduce a textless setting of VAT-GI, wherein the task relies solely on visual and auditory inputs. To approach the task, we propose a visual-audio-text inside-outside recursive autoencoder (VaTiora) framework, which leverages rich modality-specific and complementary features for effective grammar parsing. In addition, a more challenging benchmark dataset is constructed to assess the generalization ability of the VAT-GI system. Experiments on two benchmark datasets demonstrate that our proposed VaTiora system is more effective in incorporating the various multimodal signals and achieves new state-of-the-art performance on VAT-GI.
   Submitted 30 September, 2024; originally announced October 2024.
9. arXiv:2409.19872 [pdf, other]  cs.CV
   Towards Unified Multimodal Editing with Enhanced Knowledge Collaboration
   Authors: Kaihang Pan, Zhaoyu Fan, Juncheng Li, Qifan Yu, Hao Fei, Siliang Tang, Richang Hong, Hanwang Zhang, Qianru Sun
   Abstract: The swift advancement in Multimodal LLMs (MLLMs) also presents significant challenges for effective knowledge editing. Current methods, including intrinsic knowledge editing and external knowledge resorting, each possess strengths and weaknesses, struggling to balance the desired properties of reliability, generality, and locality when applied to MLLMs. In this paper, we propose UniKE, a novel multimodal editing method that establishes a unified perspective and paradigm for intrinsic knowledge editing and external knowledge resorting. Both types of knowledge are conceptualized as vectorized key-value memories, with the corresponding editing processes resembling the assimilation and accommodation phases of human cognition, conducted at the same semantic levels. Within such a unified framework, we further promote knowledge collaboration by disentangling the knowledge representations into the semantic and truthfulness spaces. Extensive experiments validate the effectiveness of our method, which ensures that the post-edit MLLM simultaneously maintains excellent reliability, generality, and locality. The code for UniKE is available at https://github.com/beepkh/UniKE.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19872v3-abstract-full').style.display = 'none'; document.getElementById('2409.19872v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024 (Spotlight)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09481">arXiv:2408.09481</a> <span> [<a href="https://arxiv.org/pdf/2408.09481">pdf</a>, <a href="https://arxiv.org/format/2408.09481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal Conversational Aspect-based Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+M">Meng Luo</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bobo Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shengqiong Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&query=Poria%2C+S">Soujanya Poria</a>, <a href="/search/cs?searchtype=author&query=Cambria%2C+E">Erik Cambria</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M">Mong-Li Lee</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W">Wynne Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09481v2-abstract-short" style="display: inline;"> While existing Aspect-based Sentiment Analysis (ABSA) has received extensive effort and advancement, there are still gaps in defining a more holistic research target seamlessly integrating multimodality, conversation context, fine-granularity, and also covering the changing sentiment dynamics as well as cognitive causal rationales. This paper bridges the gaps by introducing a multimodal conversati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09481v2-abstract-full').style.display = 'inline'; document.getElementById('2408.09481v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09481v2-abstract-full" style="display: none;"> While existing Aspect-based Sentiment Analysis (ABSA) has received extensive effort and advancement, there are still gaps in defining a more holistic research target seamlessly integrating multimodality, conversation context, fine-granularity, and also covering the changing sentiment dynamics as well as cognitive causal rationales. 
    This paper bridges these gaps by introducing a multimodal conversational ABSA, where two novel subtasks are proposed: 1) Panoptic Sentiment Sextuple Extraction, panoramically recognizing the holder, target, aspect, opinion, sentiment, and rationale from multi-turn, multi-party, multimodal dialogue; and 2) Sentiment Flipping Analysis, detecting dynamic sentiment transformations throughout the conversation together with their causal reasons. To benchmark the tasks, we construct PanoSent, a dataset annotated both manually and automatically, featuring high quality, large scale, multimodality, multilingualism, multiple scenarios, and covering both implicit and explicit sentiment elements. To effectively address the tasks, we devise a novel Chain-of-Sentiment reasoning framework, together with a novel multimodal large language model (namely Sentica) and a paraphrase-based verification mechanism. Extensive evaluations demonstrate the superiority of our methods over strong baselines, validating the efficacy of all our proposed methods. The work is expected to open up a new era for the ABSA community, and thus all our codes and data are open at https://PanoSent.github.io/
    Submitted 9 September, 2024; v1 submitted 18 August, 2024; originally announced August 2024.
    Comments: Accepted by ACM MM 2024 (Oral)
11. arXiv:2408.09462 [pdf, other]  cs.MM
    SpeechEE: A Novel Benchmark for Speech Event Extraction
    Authors: Bin Wang, Meishan Zhang, Hao Fei, Yu Zhao, Bobo Li, Shengqiong Wu, Wei Ji, Min Zhang
    Abstract: Event extraction (EE) is a critical direction in the field of information extraction, laying an important foundation for the construction of structured knowledge bases.
    EE from text has received ample research and attention for years, yet there are numerous real-world applications that require direct information acquisition from speech signals, online meeting minutes, interview summaries, press releases, etc. While EE from speech has remained under-explored, this paper fills the gap by pioneering SpeechEE, defined as detecting event predicates and arguments from a given audio speech. To benchmark the SpeechEE task, we first construct a large-scale, high-quality dataset. Based on textual EE datasets under the sentence, document, and dialogue scenarios, we convert texts into speech through both manual real-person narration and automatic synthesis, empowering the data with diverse scenarios, languages, domains, ambiences, and speaker styles. Further, to effectively address the key challenges in the task, we tailor an end-to-end SpeechEE system based on the encoder-decoder architecture, where a novel Shrinking Unit module and a retrieval-aided decoding mechanism are devised. Extensive experimental results on all SpeechEE subsets demonstrate the efficacy of the proposed model, offering a strong baseline for the task. Finally, being the first work on this topic, we shed light on key directions for future research.
    Submitted 23 August, 2024; v1 submitted 18 August, 2024; originally announced August 2024.
arXiv:2408.08632  [pdf, other]  cs.CL  cs.AI  cs.CV
A Survey on Benchmarks of Multimodal Large Language Models
Authors: Jian Li, Weiheng Lu, Hao Fei, Meng Luo, Ming Dai, Min Xia, Yizhang Jin, Zhenye Gan, Ding Qi, Chaoyou Fu, Ying Tai, Wankou Yang, Yabiao Wang, Chengjie Wang
Abstract: Multimodal Large Language Models (MLLMs) are gaining increasing popularity in both academia and industry due to their remarkable performance in various applications such as visual question answering, visual perception, understanding, and reasoning. Over the past few years, significant efforts have been made to examine MLLMs from multiple perspectives. This paper presents a comprehensive review of 200 benchmarks and evaluations for MLLMs, focusing on (1) perception and understanding, (2) cognition and reasoning, (3) specific domains, (4) key capabilities, and (5) other modalities. Finally, we discuss the limitations of the current evaluation methods for MLLMs and explore promising future directions.
Our key argument is that evaluation should be regarded as a crucial discipline to support the development of MLLMs better. For more details, please visit our GitHub repository: https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.
Submitted 6 September, 2024; v1 submitted 16 August, 2024; originally announced August 2024.

arXiv:2408.07009  [pdf, other]  cs.CV
Imagen 3
Authors: Imagen-Team-Google: Jason Baldridge, Jakob Bauer, Mukul Bhutani, Nicole Brichtova, Andrew Bunner, Kelvin Chan, Yichang Chen, Sander Dieleman, Yuqing Du, Zach Eaton-Rosen, Hongliang Fei, Nando de Freitas, Yilin Gao, Evgeny Gladchenko, Sergio Gómez Colmenarejo, Mandy Guo, Alex Haig, Will Hawkins, Hexiang Hu, Huilian Huang, Tobenna Peter Igwe, Christos Kaplanis, Siavash Khodadadeh, et al.
(227 additional authors not shown)
Abstract: We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models.
Submitted 13 August, 2024; originally announced August 2024.

arXiv:2407.21534  [pdf, other]  cs.CV
ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models
Authors: Mingrui Wu, Xinyue Cai, Jiayi Ji, Jiale Li, Oucheng Huang, Gen Luo, Hao Fei, Guannan Jiang, Xiaoshuai Sun, Rongrong Ji
Abstract: In this work, we propose a training-free method to inject visual referring into Multimodal Large Language Models (MLLMs) through learnable visual token optimization. We observe the relationship between text prompt tokens and visual tokens in MLLMs, where attention layers model the connection between them. Our approach involves adjusting visual tokens from the MLP output during inference, controlling which text prompt tokens attend to which visual tokens. We optimize a learnable visual token based on an energy function, enhancing the strength of referential regions in the attention map. This enables detailed region description and reasoning without the need for substantial training costs or model retraining. Our method offers a promising direction for integrating referential abilities into MLLMs, and supports referring with box, mask, scribble, and point. The results demonstrate that our method exhibits controllability and interpretability.
Submitted 11 November, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
Comments: Accepted to NeurIPS 2024; Code: https://github.com/mrwu-mac/ControlMLLM
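The mechanism sketched in this abstract (inference-time optimization of a learnable adjustment to the visual tokens against an attention-based energy) can be illustrated with a small PyTorch toy. The tensors, region mask, energy definition, and hyper-parameters below are stand-ins, not the paper's implementation.

```python
import torch

torch.manual_seed(0)

# Toy stand-ins for one attention layer's inputs (not real MLLM tensors):
# 16 visual tokens on a 4x4 grid, plus a query embedding for one text token.
d = 32
visual_tokens = torch.randn(16, d)
text_query = torch.randn(d)

# Binary mask marking the referenced region (e.g. derived from a user-drawn box).
region_mask = torch.zeros(16)
region_mask[[5, 6, 9, 10]] = 1.0  # hypothetical box covering 4 grid cells

# Learnable additive adjustment to the visual tokens, optimized at inference time.
delta = torch.zeros(16, d, requires_grad=True)
optimizer = torch.optim.Adam([delta], lr=0.1)

for step in range(50):
    attn = torch.softmax((visual_tokens + delta) @ text_query / d**0.5, dim=0)
    # Energy: attention mass falling outside the referenced region.
    energy = (attn * (1.0 - region_mask)).sum()
    optimizer.zero_grad()
    energy.backward()
    optimizer.step()

with torch.no_grad():
    attn = torch.softmax((visual_tokens + delta) @ text_query / d**0.5, dim=0)
print(f"attention mass inside the region: {(attn * region_mask).sum().item():.3f}")
```

No model weights are updated; only the per-image adjustment is optimized, which is what makes this style of method training-free.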
arXiv:2407.04801  [pdf, other]  cs.CL  cs.AI
Revisiting Structured Sentiment Analysis as Latent Dependency Graph Parsing
Authors: Chengjie Zhou, Bobo Li, Hao Fei, Fei Li, Chong Teng, Donghong Ji
Abstract: Structured Sentiment Analysis (SSA) was cast as a problem of bi-lexical dependency graph parsing by prior studies. Multiple formulations have been proposed to construct the graph, which share several intrinsic drawbacks: (1) the internal structures of spans are neglected, so only the boundary tokens of spans are used for relation prediction and span recognition, hindering the model's expressiveness; (2) long spans occupy a significant proportion of the SSA datasets, which further exacerbates the problem of internal structure neglect. In this paper, we treat the SSA task as a dependency parsing task on partially-observed dependency trees, regarding flat spans without determined tree annotations as latent subtrees to consider internal structures of spans. We propose a two-stage parsing method and leverage TreeCRFs with a novel constrained inside algorithm to model latent structures explicitly, which also takes advantage of jointly scoring graph arcs and headed spans for global optimization and inference.
Results of extensive experiments on five benchmark datasets reveal that our method performs significantly better than all previous bi-lexical methods, achieving a new state-of-the-art.
Submitted 5 July, 2024; originally announced July 2024.

arXiv:2407.03026  [pdf, other]  cs.SD  cs.AI  eess.AS
Qifusion-Net: Layer-adapted Stream/Non-stream Model for End-to-End Multi-Accent Speech Recognition
Authors: Jinming Chen, Jingyi Fang, Yuanzhong Zheng, Yaoxuan Wang, Haojun Fei
Abstract: Currently, end-to-end (E2E) speech recognition methods have achieved promising performance. However, automatic speech recognition (ASR) models still face challenges in recognizing multi-accent speech accurately. We propose a layer-adapted fusion (LAF) model, called Qifusion-Net, which does not require any prior knowledge about the target accent. Based on a dynamic chunk strategy, our approach enables streaming decoding and can extract frame-level acoustic features, facilitating fine-grained information fusion.
Experimental results demonstrate that our proposed methods outperform the baseline with relative reductions of 22.1% and 17.2% in character error rate (CER) across multi-accent test datasets on KeSpeech and MagicData-RMAC.
Submitted 3 July, 2024; originally announced July 2024.
Comments: Accepted by Interspeech 2024, 5 pages, 1 figure
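The reported numbers are relative, not absolute, reductions. The snippet below works through what that means, assuming a purely hypothetical baseline CER of 10.0% (the abstract does not state the baseline value here).

```python
def apply_relative_reduction(baseline_cer: float, relative_reduction: float) -> float:
    """Return the new CER after a relative (not absolute) reduction."""
    return baseline_cer * (1.0 - relative_reduction)

# With a hypothetical baseline CER of 10.0%, the two reported relative
# reductions would correspond to roughly:
print(apply_relative_reduction(10.0, 0.221))  # ~7.79% (22.1% relative)
print(apply_relative_reduction(10.0, 0.172))  # ~8.28% (17.2% relative)
```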
arXiv:2406.19389  [pdf, other]  cs.CV
OMG-LLaVA: Bridging Image-level, Object-level, Pixel-level Reasoning and Understanding
Authors: Tao Zhang, Xiangtai Li, Hao Fei, Haobo Yuan, Shengqiong Wu, Shunping Ji, Chen Change Loy, Shuicheng Yan
Abstract: Current universal segmentation methods demonstrate strong capabilities in pixel-level image and video understanding. However, they lack reasoning abilities and cannot be controlled via text instructions. In contrast, large vision-language multimodal models exhibit powerful vision-based conversation and reasoning capabilities but lack pixel-level understanding and have difficulty accepting visual prompts for flexible user interaction. This paper proposes OMG-LLaVA, a new and elegant framework combining powerful pixel-level vision understanding with reasoning abilities. It can accept various visual and text prompts for flexible user interaction. Specifically, we use a universal segmentation method as the visual encoder, integrating image information, perception priors, and visual prompts into visual tokens provided to the LLM. The LLM is responsible for understanding the user's text instructions and providing text responses and pixel-level segmentation results based on the visual information. We propose perception prior embedding to better integrate perception priors with image features. OMG-LLaVA achieves image-level, object-level, and pixel-level reasoning and understanding in a single model, matching or surpassing the performance of specialized methods on multiple benchmarks. Rather than using LLM to connect each specialist, our work aims at end-to-end training on one encoder, one decoder, and one LLM. The code and model have been released for further research.
Submitted 1 October, 2024; v1 submitted 27 June, 2024; originally announced June 2024.
Comments: NeurIPS 2024. Project page: https://lxtgh.github.io/project/omg_llava/

arXiv:2406.19255  [pdf, other]  cs.CV  cs.CL  doi: 10.1109/TPAMI.2024.3393452
Enhancing Video-Language Representations with Structural Spatio-Temporal Alignment
Authors: Hao Fei, Shengqiong Wu, Meishan Zhang, Min Zhang, Tat-Seng Chua, Shuicheng Yan
id="2406.19255v1-abstract-short" style="display: inline;"> While pre-training large-scale video-language models (VLMs) has shown remarkable potential for various downstream video-language tasks, existing VLMs can still suffer from certain commonly seen limitations, e.g., coarse-grained cross-modal aligning , under-modeling of temporal dynamics, detached video-language view. In this work, we target enhancing VLMs with a fine-grained structural spatio-tempo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19255v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19255v1-abstract-full" style="display: none;"> While pre-training large-scale video-language models (VLMs) has shown remarkable potential for various downstream video-language tasks, existing VLMs can still suffer from certain commonly seen limitations, e.g., coarse-grained cross-modal aligning , under-modeling of temporal dynamics, detached video-language view. In this work, we target enhancing VLMs with a fine-grained structural spatio-temporal alignment learning method (namely Finsta). First of all, we represent the input texts and videos with fine-grained scene graph (SG) structures, both of which are further unified into a holistic SG (HSG) for bridging two modalities. Then, an SG-based framework is built, where the textual SG (TSG) is encoded with a graph Transformer, while the video dynamic SG (DSG) and the HSG are modeled with a novel recurrent graph Transformer for spatial and temporal feature propagation. A spatial-temporal Gaussian differential graph Transformer is further devised to strengthen the sense of the changes in objects across spatial and temporal dimensions. Next, based on the fine-grained structural features of TSG and DSG, we perform object-centered spatial alignment and predicate-centered temporal alignment respectively, enhancing the video-language grounding in both the spatiality and temporality. We design our method as a plug&play system, which can be integrated into existing well-trained VLMs for further representation augmentation, without training from scratch or relying on SG annotations in downstream applications. On 6 representative VL modeling tasks over 12 datasets in both standard and long-form video scenarios, Finsta consistently improves the existing 13 strong-performing VLMs persistently, and refreshes the current state-of-the-art end task performance significantly in both the fine-tuning and zero-shot settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19255v1-abstract-full').style.display = 'none'; document.getElementById('2406.19255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
Comments: Accepted by IEEE TPAMI 2024
Journal ref: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2024

arXiv:2406.15177  [pdf, other]  cs.MM
EmpathyEar: An Open-source Avatar Multimodal Empathetic Chatbot
Authors: Hao Fei, Han Zhang, Bin Wang, Lizi Liao, Qian Liu, Erik Cambria
Abstract: This paper introduces EmpathyEar, a pioneering open-source, avatar-based multimodal empathetic chatbot, to fill the gap in traditional text-only empathetic response generation (ERG) systems. Leveraging the advancements of a large language model, combined with multimodal encoders and generators, EmpathyEar supports user inputs in any combination of text, sound, and vision, and produces multimodal empathetic responses, offering users not just textual responses but also digital avatars with talking faces and synchronized speech. A series of emotion-aware instruction-tuning steps is performed for comprehensive emotional understanding and generation capabilities. In this way, EmpathyEar provides users with responses that achieve a deeper emotional resonance, closely emulating human-like empathy. The system paves the way for the next generation of emotional intelligence, for which we open-source the code for public access.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15177v1-abstract-full').style.display = 'none'; document.getElementById('2406.15177v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Demonstration Paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05127">arXiv:2406.05127</a> <span> [<a href="https://arxiv.org/pdf/2406.05127">pdf</a>, <a href="https://arxiv.org/format/2406.05127">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Semantic Equivalence of Tokenization in Multimodal LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shengqiong Wu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jiayi Ji</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shuicheng Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05127v3-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have demonstrated exceptional capabilities in processing vision-language tasks. One of the crux of MLLMs lies in vision tokenization, which involves efficiently transforming input visual signals into feature representations that are most beneficial for LLMs. However, existing vision tokenizers, essential for semantic alignment between vision and language, r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05127v3-abstract-full').style.display = 'inline'; document.getElementById('2406.05127v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05127v3-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have demonstrated exceptional capabilities in processing vision-language tasks. One of the crux of MLLMs lies in vision tokenization, which involves efficiently transforming input visual signals into feature representations that are most beneficial for LLMs. However, existing vision tokenizers, essential for semantic alignment between vision and language, remain problematic. Existing methods aggressively fragment visual input, corrupting the visual semantic integrity. To address this, this paper proposes a novel dynamic Semantic-Equivalent Vision Tokenizer (SeTok), which groups visual features into semantic units via a dynamic clustering algorithm, flexibly determining the number of tokens based on image complexity. 
The resulting vision tokens effectively preserve semantic integrity and capture both low-frequency and high-frequency visual features. The proposed MLLM (Setokim) equipped with SeTok significantly demonstrates superior performance across various tasks, as evidenced by our experimental results. The project page is at https://chocowu.github.io/SeTok-web/.
Submitted 9 October, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
Comments: Technical Report. The project page: https://chocowu.github.io/SeTok-web/
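The core idea here, clustering patch features into a variable number of semantic units instead of a fixed grid of tokens, can be illustrated with a generic threshold-based clustering. The sketch below uses scikit-learn's agglomerative clustering as a stand-in; the clustering rule, threshold, and toy features are assumptions for illustration, not SeTok's actual algorithm.

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

def cluster_patches_into_tokens(patch_features: np.ndarray,
                                distance_threshold: float = 2.0) -> np.ndarray:
    """Group patch features into a variable number of 'semantic' tokens by
    clustering and mean-pooling each cluster (illustrative, not SeTok itself)."""
    clustering = AgglomerativeClustering(n_clusters=None,
                                         distance_threshold=distance_threshold,
                                         linkage="ward")
    labels = clustering.fit_predict(patch_features)
    tokens = np.stack([patch_features[labels == k].mean(axis=0)
                       for k in np.unique(labels)])
    return tokens  # shape: (num_clusters, feature_dim); num_clusters varies per image

# A visually simple image (near-identical patch features) yields fewer tokens
# than a complex one, which is the behaviour the abstract describes.
rng = np.random.default_rng(0)
simple = rng.normal(0.0, 0.05, size=(64, 16))
complex_ = np.concatenate([rng.normal(c, 0.05, size=(16, 16)) for c in (0, 3, 6, 9)])
print(cluster_patches_into_tokens(simple).shape,
      cluster_patches_into_tokens(complex_).shape)
```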
arXiv:2406.03701  [pdf, other]  cs.MM
Recognizing Everything from All Modalities at Once: Grounded Multimodal Universal Information Extraction
Authors: Meishan Zhang, Hao Fei, Bin Wang, Shengqiong Wu, Yixin Cao, Fei Li, Min Zhang
Abstract: In the field of information extraction (IE), tasks across a wide range of modalities and their combinations have been traditionally studied in isolation, leaving a gap in deeply recognizing and analyzing cross-modal information. To address this, this work for the first time introduces the concept of grounded Multimodal Universal Information Extraction (MUIE), providing a unified task framework to analyze any IE tasks over various modalities, along with their fine-grained groundings. To tackle MUIE, we tailor a multimodal large language model (MLLM), Reamo, capable of extracting and grounding information from all modalities, i.e., recognizing everything from all modalities at once. Reamo is updated via varied tuning strategies, equipping it with powerful capabilities for information recognition and fine-grained multimodal grounding. To address the absence of a suitable benchmark for grounded MUIE, we curate a high-quality, diverse, and challenging test set, which encompasses IE tasks across 9 common modality combinations with the corresponding multimodal groundings. The extensive comparison of Reamo with existing MLLMs integrated into pipeline approaches demonstrates its advantages across all evaluation dimensions, establishing a strong benchmark for the follow-up research. Our resources are publicly released at https://haofei.vip/MUIE.
Submitted 11 June, 2024; v1 submitted 5 June, 2024; originally announced June 2024.

arXiv:2405.18357  [pdf, other]  cs.CL
Faithful Logical Reasoning via Symbolic Chain-of-Thought
Authors: Jundong Xu, Hao Fei, Liangming Pan, Qian Liu, Mong-Li Lee, Wynne Hsu
Abstract: While the recent Chain-of-Thought (CoT) technique enhances the reasoning ability of large language models (LLMs) with the theory of mind, it might still struggle in handling logical reasoning that relies much on symbolic expressions and rigid deducing rules. To strengthen the logical reasoning capability of LLMs, we propose a novel Symbolic Chain-of-Thought, namely SymbCoT, a fully LLM-based framework that integrates symbolic expressions and logic rules with CoT prompting. Technically, building upon an LLM, SymbCoT 1) first translates the natural language context into the symbolic format, and then 2) derives a step-by-step plan to solve the problem with symbolic logical rules, 3) followed by a verifier to check the translation and reasoning chain. Via thorough evaluations on 5 standard datasets with both First-Order Logic and Constraint Optimization symbolic expressions, SymbCoT shows striking improvements over the CoT method consistently, meanwhile refreshing the current state-of-the-art performances. We further demonstrate that our system advances in more faithful, flexible, and explainable logical reasoning. To our knowledge, this is the first work to combine symbolic expressions and rules into CoT for logical reasoning with LLMs. Code is open at https://github.com/Aiden0526/SymbCoT.
Submitted 11 June, 2024; v1 submitted 28 May, 2024; originally announced May 2024.
Comments: Accepted by ACL 2024 (main proceeding)
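The three-stage structure described in the abstract (translate, then plan/derive, then verify) can be wired up around any text-in/text-out model. The sketch below is a minimal illustration under that assumption; the prompts are invented, not the paper's templates, and `llm` is a placeholder for whatever model client is available.

```python
from typing import Callable

def symbolic_cot(question: str, context: str, llm: Callable[[str], str]) -> str:
    """Translate -> derive -> verify staging, illustrative of the SymbCoT idea."""
    # 1) Translator: natural language -> symbolic form (e.g. first-order logic).
    symbolic = llm(
        "Translate the following premises and question into first-order logic.\n"
        f"Premises: {context}\nQuestion: {question}"
    )
    # 2) Planner/solver: derive the answer step by step using logical rules.
    derivation = llm(
        "Using the symbolic premises below, derive the answer step by step, "
        "citing the inference rule used at each step.\n" + symbolic
    )
    # 3) Verifier: check both the translation and the reasoning chain.
    verdict = llm(
        "Check whether the translation is faithful and every derivation step is "
        "valid. Reply 'VALID' or point out the flawed step.\n"
        f"Translation:\n{symbolic}\nDerivation:\n{derivation}"
    )
    return derivation if verdict.strip().startswith("VALID") else verdict

# Usage with a stubbed model (replace the lambda with a real LLM call):
answer = symbolic_cot("Is Socrates mortal?",
                      "All men are mortal. Socrates is a man.",
                      llm=lambda prompt: "VALID")
```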
arXiv:2405.16759  [pdf, other]  cs.CV  cs.LG
Greedy Growing Enables High-Resolution Pixel-Based Diffusion Models
Authors: Cristina N. Vasconcelos, Abdullah Rashwan, Austin Waters, Trevor Walker, Keyang Xu, Jimmy Yan, Rui Qian, Shixin Luo, Zarana Parekh, Andrew Bunner, Hongliang Fei, Roopal Garg, Mandy Guo, Ivana Kajic, Yeqing Li, Henna Nandwani, Jordi Pont-Tuset, Yasumasa Onoe, Sarah Rosston, Su Wang, Wenlei Zhou, Kevin Swersky, David J. Fleet, Jason M. Baldridge, Oliver Wang
Abstract: We address the long-standing problem of how to learn effective pixel-based image diffusion models at scale, introducing a remarkably simple greedy growing method for stable training of large-scale, high-resolution models, without the need for cascaded super-resolution components. The key insight stems from careful pre-training of core components, namely, those responsible for text-to-image alignment vs. high-resolution rendering. We first demonstrate the benefits of scaling a Shallow UNet, with no down(up)-sampling enc(dec)oder. Scaling its deep core layers is shown to improve alignment, object structure, and composition.
Building on this core model, we propose a greedy algorithm that grows the architecture into high-resolution end-to-end models, while preserving the integrity of the pre-trained representation, stabilizing training, and reducing the need for large high-resolution datasets. This enables a single-stage model capable of generating high-resolution images without the need of a super-resolution cascade. Our key results rely on public datasets and show that we are able to train non-cascaded models up to 8B parameters with no further regularization schemes. Vermeer, our full pipeline model trained with internal datasets to produce 1024x1024 images, without cascades, is preferred by human evaluators over SDXL by 44.0% vs. 21.4%.
Submitted 26 May, 2024; originally announced May 2024.

arXiv:2405.15452  [pdf, other]  cs.CL  cs.AI  cs.LG
Leveraging Logical Rules in Knowledge Editing: A Cherry on the Top
Authors: Keyuan Cheng, Muhammad Asif Ali, Shu Yang, Gang Lin, Yuxuan Zhai, Haoyang Fei, Ke Xu, Lu Yu, Lijie Hu, Di Wang
Abstract: Multi-hop Question Answering (MQA) under knowledge editing (KE) is a key challenge in Large Language Models (LLMs).
While best-performing solutions in this domain use a plan-and-solve paradigm to split a question into sub-questions followed by response generation, we claim that this approach is sub-optimal as it fails for hard-to-decompose questions, and it does not explicitly cater to correlated knowledge updates resulting as a consequence of knowledge edits. This has a detrimental impact on the overall consistency of the updated knowledge. To address these issues, in this paper, we propose a novel framework named RULE-KE, i.e., RULE-based Knowledge Editing, which is a cherry on the top for augmenting the performance of all existing MQA methods under KE. Specifically, RULE-KE leverages rule discovery to discover a set of logical rules. Then, it uses these discovered rules to update knowledge about facts highly correlated with the edit. Experimental evaluation using existing and newly curated datasets (i.e., RKE-EVAL) shows that RULE-KE augments the performance of both parameter-based and memory-based solutions by up to 92% and 112.9%, respectively.
Submitted 27 May, 2024; v1 submitted 24 May, 2024; originally announced May 2024.
Comments: 18 pages
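The central idea, using discovered logical rules to propagate an edit to the facts correlated with it, can be shown with a toy rule over a toy fact memory. The rule (symmetry of a spouse relation), the fact format, and the memory representation below are invented for illustration and are not RULE-KE's actual rule language or storage.

```python
# Toy knowledge memory of (subject, relation, object) facts.
memory = {("Alice", "spouse_of", "Bob"), ("Alice", "lives_in", "Paris")}

# A discovered logical rule, here the symmetry of spouse_of:
#   (X, spouse_of, Y) -> (Y, spouse_of, X)
def symmetric_spouse(fact):
    s, r, o = fact
    return [(o, r, s)] if r == "spouse_of" else []

def apply_edit(memory, edit, rules):
    """Insert the edited fact, then add every correlated fact implied by the rules,
    so the memory stays self-consistent (illustrative of the RULE-KE idea)."""
    frontier = [edit]
    while frontier:
        fact = frontier.pop()
        if fact in memory:
            continue
        memory.add(fact)
        for rule in rules:
            frontier.extend(rule(fact))
    return memory

apply_edit(memory, ("Alice", "spouse_of", "Carol"), [symmetric_spouse])
print(("Carol", "spouse_of", "Alice") in memory)  # True: the correlated update was added
```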
Unlike previous studies focusing on protein property prediction and protein-text retrieval, we delve into the largely unexplored field of protein-to-text generation. To facilitate comprehensive benchmarks and promote future research, we establish quantitative evaluations for protein-text modeling tasks, including protein captioning, protein question-answering, and protein-text retrieval. Our experiments show that ProtT3 substantially surpasses current baselines, with ablation studies further highlighting the efficacy of its core components. Our code is available at https://github.com/acharkq/ProtT3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12564v1-abstract-full').style.display = 'none'; document.getElementById('2405.12564v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024, 9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00492">arXiv:2404.00492</a> <span> [<a href="https://arxiv.org/pdf/2404.00492">pdf</a>, <a href="https://arxiv.org/format/2404.00492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-hop Question Answering under Temporal Knowledge Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+K">Keyuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+G">Gang Lin</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Haoyang Fei</a>, <a href="/search/cs?searchtype=author&query=zhai%2C+Y">Yuxuan zhai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lu Yu</a>, <a href="/search/cs?searchtype=author&query=Ali%2C+M+A">Muhammad Asif Ali</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lijie Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00492v1-abstract-short" style="display: inline;"> Multi-hop question answering (MQA) under knowledge editing (KE) has garnered significant attention in the era of large language models. However, existing models for MQA under KE exhibit poor performance when dealing with questions containing explicit temporal contexts. 
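<p class="is-size-7">As a rough illustration of the cross-modal projector described above, the sketch below compresses a variable-length sequence of protein-encoder states into a fixed number of learned query tokens with cross-attention, then maps them into the language model's input-embedding dimension. All sizes and the module layout are assumptions for illustration; ProtT3's actual projector is a BLIP-2-style Q-Former trained jointly with the rest of the system.</p>
<pre><code>
import torch
import torch.nn as nn

class TinyCrossModalProjector(nn.Module):
    """Toy Q-Former-style bridge: PLM token states become a few LM-ready soft tokens.
    Sizes are illustrative, not those used by ProtT3."""
    def __init__(self, plm_dim=480, lm_dim=2048, num_queries=8, num_heads=8):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_queries, plm_dim) * 0.02)
        self.cross_attn = nn.MultiheadAttention(plm_dim, num_heads, batch_first=True)
        self.proj = nn.Linear(plm_dim, lm_dim)  # into the LM's input-embedding space

    def forward(self, protein_states):
        # protein_states: (batch, seq_len, plm_dim) from a frozen protein language model
        batch = protein_states.size(0)
        q = self.queries.unsqueeze(0).expand(batch, -1, -1)
        attended, _ = self.cross_attn(q, protein_states, protein_states)
        return self.proj(attended)              # (batch, num_queries, lm_dim)

# Usage: prepend these soft tokens to the text embeddings fed into the language model.
projector = TinyCrossModalProjector()
soft_tokens = projector(torch.randn(2, 300, 480))
print(soft_tokens.shape)  # torch.Size([2, 8, 2048])
</code></pre>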
To address this limitation, we propose a novel framework, namely TEMPoral knowLEdge augmented Multi-hop Question Answering (TEMPLE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00492v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00492v1-abstract-full" style="display: none;"> Multi-hop question answering (MQA) under knowledge editing (KE) has garnered significant attention in the era of large language models. However, existing models for MQA under KE exhibit poor performance when dealing with questions containing explicit temporal contexts. To address this limitation, we propose a novel framework, namely TEMPoral knowLEdge augmented Multi-hop Question Answering (TEMPLE-MQA). Unlike previous methods, TEMPLE-MQA first constructs a time-aware graph (TAG) to store edit knowledge in a structured manner. Then, through our proposed inference path, structural retrieval, and joint reasoning stages, TEMPLE-MQA effectively discerns temporal contexts within the question query. Experiments on benchmark datasets demonstrate that TEMPLE-MQA significantly outperforms baseline models. Additionally, we contribute a new dataset, namely TKEMQA, which serves as the inaugural benchmark tailored specifically for MQA with temporal scopes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00492v1-abstract-full').style.display = 'none'; document.getElementById('2404.00492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
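<p class="is-size-7">The time-aware storage idea can be approximated with a very small structure: each edited fact is kept as a (subject, relation, object, valid-from, valid-to) record, and retrieval returns the object whose validity interval covers the queried time. This is only a sketch of that bookkeeping under assumed field names; it is not TEMPLE-MQA's actual TAG construction, structural retrieval, or joint reasoning stages.</p>
<pre><code>
from dataclasses import dataclass
from datetime import date

@dataclass
class TimedFact:
    subject: str
    relation: str
    obj: str
    valid_from: date
    valid_to: date        # use date.max for "still valid"

# Hypothetical edited knowledge with explicit temporal scopes.
store = [
    TimedFact("CompanyX", "ceo", "Person A", date(2015, 1, 1), date(2021, 6, 30)),
    TimedFact("CompanyX", "ceo", "Person B", date(2021, 7, 1), date.max),
]

def lookup(store, subject, relation, at):
    """Return the object valid at time `at`, or None if no edit covers that time."""
    for f in store:
        if f.subject == subject and f.relation == relation and f.valid_from <= at <= f.valid_to:
            return f.obj
    return None

print(lookup(store, "CompanyX", "ceo", date(2019, 5, 1)))  # Person A
print(lookup(store, "CompanyX", "ceo", date(2023, 1, 1)))  # Person B
</code></pre>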
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15776">arXiv:2403.15776</a> <span> [<a href="https://arxiv.org/pdf/2403.15776">pdf</a>, <a href="https://arxiv.org/format/2403.15776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Modeling Unified Semantic Discourse Structure for High-quality Headline Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minghui Xu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fei Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shengqiong Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Rui Sun</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+C">Chong Teng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Donghong Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15776v1-abstract-short" style="display: inline;"> Headline generation aims to summarize a long document with a short, catchy title that reflects the main idea. This requires accurately capturing the core document semantics, which is challenging due to the lengthy and background information-rich na ture of the texts. In this work, We propose using a unified semantic discourse structure (S3) to represent document semantics, achieved by combining do… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15776v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15776v1-abstract-full" style="display: none;"> Headline generation aims to summarize a long document with a short, catchy title that reflects the main idea. This requires accurately capturing the core document semantics, which is challenging due to the lengthy and background information-rich na ture of the texts. In this work, We propose using a unified semantic discourse structure (S3) to represent document semantics, achieved by combining document-level rhetorical structure theory (RST) trees with sentence-level abstract meaning representation (AMR) graphs to construct S3 graphs. The hierarchical composition of sentence, clause, and word intrinsically characterizes the semantic meaning of the overall document. We then develop a headline generation framework, in which the S3 graphs are encoded as contextual features. To consolidate the efficacy of S3 graphs, we further devise a hierarchical structure pruning mechanism to dynamically screen the redundant and nonessential nodes within the graph. Experimental results on two headline generation datasets demonstrate that our method outperforms existing state-of-art methods consistently. 
Our work can be instructive for a broad range of document modeling tasks, more than headline or summarization generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15776v1-abstract-full').style.display = 'none'; document.getElementById('2403.15776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03605">arXiv:2403.03605</a> <span> [<a href="https://arxiv.org/pdf/2403.03605">pdf</a>, <a href="https://arxiv.org/format/2403.03605">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Multi-time-step coupling of peridynamics and classical continuum mechanics for dynamic brittle fracture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiandong%2C+Z">Zhong Jiandong</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Han Fei</a>, <a href="/search/cs?searchtype=author&query=Zongliang%2C+D">Du Zongliang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guo Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03605v1-abstract-short" style="display: inline;"> Peridynamics (PD), as a nonlocal theory, is well-suited for solving problems with discontinuities, such as cracks. However, the nonlocal effect of peridynamics makes it computationally expensive for dynamic fracture problems in large-scale engineering applications. As an alternative, this study proposes a multi-time-step (MTS) coupling model of PD and classical continuum mechanics (CCM) based on t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03605v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03605v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03605v1-abstract-full" style="display: none;"> Peridynamics (PD), as a nonlocal theory, is well-suited for solving problems with discontinuities, such as cracks. However, the nonlocal effect of peridynamics makes it computationally expensive for dynamic fracture problems in large-scale engineering applications. As an alternative, this study proposes a multi-time-step (MTS) coupling model of PD and classical continuum mechanics (CCM) based on the Arlequin framework. Peridynamics is applied to the fracture domain of the structure, while continuum mechanics is applied to the rest of the structure. The MTS method enables the peridynamic model to be solved at a small time step and the continuum mechanical model is solved at a larger time step. Consequently, higher computational efficiency is achieved for the fracture domain of the structure while ensuring computational accuracy, and this coupling method can be easily applied to large-scale engineering fracture problems. 
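<p class="is-size-7">The multi-time-step pattern described above, a fine step for the critical subdomain and a coarse step elsewhere, can be illustrated with a toy subcycling loop on two coupled spring-mass oscillators: mass 1 stands in for the finely resolved (PD-like) region and takes several small steps per single large step of mass 2. The toy problem and all constants are assumptions; it shows only the time-stepping pattern, not the Arlequin coupling itself.</p>
<pre><code>
import numpy as np

# Toy subcycling: subdomain 1 (fine time step) coupled to subdomain 2 (coarse time step).
m1, m2 = 1.0, 1.0               # masses
k1, k2, kc = 50.0, 5.0, 10.0    # wall springs and coupling spring
DT = 1e-2                       # coarse step for subdomain 2
N_SUB = 10                      # fine steps per coarse step for subdomain 1
dt = DT / N_SUB

x1, v1, x2, v2 = 1.0, 0.0, 0.0, 0.0
for step in range(1000):
    # advance subdomain 1 with N_SUB fine steps, holding the interface value x2 frozen
    for _ in range(N_SUB):
        f1 = -k1 * x1 - kc * (x1 - x2)
        v1 += dt * f1 / m1
        x1 += dt * v1
    # advance subdomain 2 with one coarse step using the updated interface value x1
    f2 = -k2 * x2 - kc * (x2 - x1)
    v2 += DT * f2 / m2
    x2 += DT * v2

print(round(x1, 4), round(x2, 4))
</code></pre>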
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03605v1-abstract-full').style.display = 'none'; document.getElementById('2403.03605v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 17 figures, 81 conferences</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03486">arXiv:2403.03486</a> <span> [<a href="https://arxiv.org/pdf/2403.03486">pdf</a>, <a href="https://arxiv.org/format/2403.03486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> PhenoAuth: A Novel PUF-Phenotype-based Authentication Protocol for IoT Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hongming Fei</a>, <a href="/search/cs?searchtype=author&query=Millwood%2C+O">Owen Millwood</a>, <a href="/search/cs?searchtype=author&query=Prosanta%2C+G">Gope Prosanta</a>, <a href="/search/cs?searchtype=author&query=Miskelly%2C+J">Jack Miskelly</a>, <a href="/search/cs?searchtype=author&query=Sikdar%2C+B">Biplab Sikdar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03486v1-abstract-short" style="display: inline;"> Physical Unclonable Functions (PUFs) have been shown to be a highly promising solution for enabling high security systems tailored for low-power devices. Commonly, PUFs are utilised to generate cryptographic keys on-the-fly, replacing the need to store keys in vulnerable, non-volatile memories. Due to the physical nature of PUFs, environmental variations cause noise, manifesting themselves as erro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03486v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03486v1-abstract-full" style="display: none;"> Physical Unclonable Functions (PUFs) have been shown to be a highly promising solution for enabling high security systems tailored for low-power devices. Commonly, PUFs are utilised to generate cryptographic keys on-the-fly, replacing the need to store keys in vulnerable, non-volatile memories. Due to the physical nature of PUFs, environmental variations cause noise, manifesting themselves as errors which are apparent in the initial PUF measurements. This necessitates expensive active error correction techniques which can run counter to the goal of lightweight security. ML-based techniques for authenticating noisy PUF measurements were explored as an alternative to error correction techniques, bringing about the concept of a PUF Phenotype, where PUF identity is considered as a structure agnostic representation of the PUF, with relevant noise encoding. 
This work proposes a full noise-tolerant authentication protocol based on the PUF Phenotype concept and methodology for an Internet-of-Things (IoT) network, demonstrating mutual authentication and forward secrecy in a setting suitable for device-to-device communication. Upon conducting security and performance analyses, it is evident that our proposed scheme demonstrates resilience against various attacks compared to the currently existing PUF protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03486v1-abstract-full').style.display = 'none'; document.getElementById('2403.03486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68M25 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.00464">arXiv:2403.00464</a> <span> [<a href="https://arxiv.org/pdf/2403.00464">pdf</a>, <a href="https://arxiv.org/format/2403.00464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Attacking Delay-based PUFs with Minimal Adversary Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hongming Fei</a>, <a href="/search/cs?searchtype=author&query=Millwood%2C+O">Owen Millwood</a>, <a href="/search/cs?searchtype=author&query=Gope%2C+P">Prosanta Gope</a>, <a href="/search/cs?searchtype=author&query=Miskelly%2C+J">Jack Miskelly</a>, <a href="/search/cs?searchtype=author&query=Sikdar%2C+B">Biplab Sikdar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.00464v1-abstract-short" style="display: inline;"> Physically Unclonable Functions (PUFs) provide a streamlined solution for lightweight device authentication. Delay-based Arbiter PUFs, with their ease of implementation and vast challenge space, have received significant attention; however, they are not immune to modelling attacks that exploit correlations between their inputs and outputs. Research is therefore polarized between developing modelli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.00464v1-abstract-full').style.display = 'inline'; document.getElementById('2403.00464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.00464v1-abstract-full" style="display: none;"> Physically Unclonable Functions (PUFs) provide a streamlined solution for lightweight device authentication. 
Delay-based Arbiter PUFs, with their ease of implementation and vast challenge space, have received significant attention; however, they are not immune to modelling attacks that exploit correlations between their inputs and outputs. Research is therefore polarized between developing modelling-resistant PUFs and devising machine learning attacks against them. This dichotomy often results in exaggerated concerns and overconfidence in PUF security, primarily because there lacks a universal tool to gauge a PUF's security. In many scenarios, attacks require additional information, such as PUF type or configuration parameters. Alarmingly, new PUFs are often branded `secure' if they lack a specific attack model upon introduction. To impartially assess the security of delay-based PUFs, we present a generic framework featuring a Mixture-of-PUF-Experts (MoPE) structure for mounting attacks on various PUFs with minimal adversarial knowledge, which provides a way to compare their performance fairly and impartially. We demonstrate the capability of our model to attack different PUF types, including the first successful attack on Heterogeneous Feed-Forward PUFs using only a reasonable amount of challenges and responses. We propose an extension version of our model, a Multi-gate Mixture-of-PUF-Experts (MMoPE) structure, facilitating multi-task learning across diverse PUFs to recognise commonalities across PUF designs. This allows a streamlining of training periods for attacking multiple PUFs simultaneously. We conclude by showcasing the potent performance of MoPE and MMoPE across a spectrum of PUF types, employing simulated, real-world unbiased, and biased data sets for analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.00464v1-abstract-full').style.display = 'none'; document.getElementById('2403.00464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
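<p class="is-size-7">The Mixture-of-PUF-Experts idea, several expert predictors plus a gating network that weights their outputs per challenge, can be sketched as a small PyTorch module. Input width, expert count, and layer sizes are illustrative assumptions; the paper's MoPE and MMoPE models and their training setup are more involved.</p>
<pre><code>
import torch
import torch.nn as nn

class TinyMoPE(nn.Module):
    """Toy mixture-of-experts for modelling a PUF: challenge bits in, response logit out."""
    def __init__(self, n_bits=64, n_experts=4, hidden=64):
        super().__init__()
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(n_bits, hidden), nn.ReLU(), nn.Linear(hidden, 1))
            for _ in range(n_experts)
        ])
        self.gate = nn.Sequential(nn.Linear(n_bits, n_experts), nn.Softmax(dim=-1))

    def forward(self, challenges):
        # challenges: (batch, n_bits), e.g. 0/1 challenge vectors cast to float
        expert_logits = torch.cat([e(challenges) for e in self.experts], dim=-1)  # (B, E)
        weights = self.gate(challenges)                                           # (B, E)
        return (weights * expert_logits).sum(dim=-1)   # per-challenge response logit

model = TinyMoPE()
logits = model(torch.randint(0, 2, (8, 64)).float())
# train with nn.BCEWithLogitsLoss() against observed responses
print(logits.shape)  # torch.Size([8])
</code></pre>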
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures, journal</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68M25 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11435">arXiv:2402.11435</a> <span> [<a href="https://arxiv.org/pdf/2402.11435">pdf</a>, <a href="https://arxiv.org/format/2402.11435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qian%2C+L">Long Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yaobo Ye</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siliang Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11435v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate remarkable proficiency in comprehending and handling text-based tasks. Many efforts are being made to transfer these attributes to video modality, which are termed Video-LLMs. However, existing Video-LLMs can only capture the coarse-grained semantics and are unable to effectively handle tasks related to comprehension or localization of specific video segmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11435v2-abstract-full').style.display = 'inline'; document.getElementById('2402.11435v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11435v2-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate remarkable proficiency in comprehending and handling text-based tasks. Many efforts are being made to transfer these attributes to video modality, which are termed Video-LLMs. However, existing Video-LLMs can only capture the coarse-grained semantics and are unable to effectively handle tasks related to comprehension or localization of specific video segments. In light of these challenges, we propose Momentor, a Video-LLM capable of accomplishing fine-grained temporal understanding tasks. To support the training of Momentor, we design an automatic data generation engine to construct Moment-10M, a large-scale video instruction dataset with segment-level instruction data. We train Momentor on Moment-10M, enabling it to perform segment-level reasoning and localization. Zero-shot evaluations on several tasks demonstrate that Momentor excels in fine-grained temporally grounded comprehension and localization. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11435v2-abstract-full').style.display = 'none'; document.getElementById('2402.11435v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01182">arXiv:2402.01182</a> <span> [<a href="https://arxiv.org/pdf/2402.01182">pdf</a>, <a href="https://arxiv.org/format/2402.01182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> In-Context Learning for Few-Shot Nested Named Entity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01182v1-abstract-short" style="display: inline;"> In nested Named entity recognition (NER), entities are nested with each other, and thus requiring more data annotations to address. This leads to the development of few-shot nested NER, where the prevalence of pretrained language models with in-context learning (ICL) offers promising solutions. In this work, we introduce an effective and innovative ICL framework for the setting of few-shot nested… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01182v1-abstract-full').style.display = 'inline'; document.getElementById('2402.01182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01182v1-abstract-full" style="display: none;"> In nested Named entity recognition (NER), entities are nested with each other, and thus requiring more data annotations to address. This leads to the development of few-shot nested NER, where the prevalence of pretrained language models with in-context learning (ICL) offers promising solutions. In this work, we introduce an effective and innovative ICL framework for the setting of few-shot nested NER. We improve the ICL prompt by devising a novel example demonstration selection mechanism, EnDe retriever. In EnDe retriever, we employ contrastive learning to perform three types of representation learning, in terms of semantic similarity, boundary similarity, and label similarity, to generate high-quality demonstration examples. Extensive experiments over three nested NER and four flat NER datasets demonstrate the efficacy of our system. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01182v1-abstract-full').style.display = 'none'; document.getElementById('2402.01182v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.15603">arXiv:2401.15603</a> <span> [<a href="https://arxiv.org/pdf/2401.15603">pdf</a>, <a href="https://arxiv.org/format/2401.15603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Improving Expressive Power of Spectral Graph Neural Networks with Eigenvalue Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+K">Kangkang Lu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yanhua Yu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zixuan Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zirui Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+M">Meiyu Liang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Mengran Yin</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.15603v2-abstract-short" style="display: inline;"> In recent years, spectral graph neural networks, characterized by polynomial filters, have garnered increasing attention and have achieved remarkable performance in tasks such as node classification. These models typically assume that eigenvalues for the normalized Laplacian matrix are distinct from each other, thus expecting a polynomial filter to have a high fitting ability. However, this paper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15603v2-abstract-full').style.display = 'inline'; document.getElementById('2401.15603v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.15603v2-abstract-full" style="display: none;"> In recent years, spectral graph neural networks, characterized by polynomial filters, have garnered increasing attention and have achieved remarkable performance in tasks such as node classification. These models typically assume that eigenvalues for the normalized Laplacian matrix are distinct from each other, thus expecting a polynomial filter to have a high fitting ability. 
However, this paper empirically observes that normalized Laplacian matrices frequently possess repeated eigenvalues. Moreover, we theoretically establish that the number of distinguishable eigenvalues plays a pivotal role in determining the expressive power of spectral graph neural networks. In light of this observation, we propose an eigenvalue correction strategy that can free polynomial filters from the constraints of repeated eigenvalue inputs. Concretely, the proposed eigenvalue correction strategy enhances the uniform distribution of eigenvalues, thus mitigating repeated eigenvalues, and improving the fitting capacity and expressive power of polynomial filters. Extensive experimental results on both synthetic and real-world datasets demonstrate the superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15603v2-abstract-full').style.display = 'none'; document.getElementById('2401.15603v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI-24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10404">arXiv:2401.10404</a> <span> [<a href="https://arxiv.org/pdf/2401.10404">pdf</a>, <a href="https://arxiv.org/format/2401.10404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Inflation with Diffusion: Efficient Temporal Adaptation for Text-to-Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/cs?searchtype=author&query=Baek%2C+J">Jinoo Baek</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Keyang Xu</a>, <a href="/search/cs?searchtype=author&query=Tov%2C+O">Omer Tov</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hongliang Fei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10404v1-abstract-short" style="display: inline;"> We propose an efficient diffusion-based text-to-video super-resolution (SR) tuning approach that leverages the readily learned capacity of pixel level image diffusion model to capture spatial information for video generation. To accomplish this goal, we design an efficient architecture by inflating the weightings of the text-to-image SR model into our video generation framework. 
Additionally, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10404v1-abstract-full').style.display = 'inline'; document.getElementById('2401.10404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10404v1-abstract-full" style="display: none;"> We propose an efficient diffusion-based text-to-video super-resolution (SR) tuning approach that leverages the readily learned capacity of pixel level image diffusion model to capture spatial information for video generation. To accomplish this goal, we design an efficient architecture by inflating the weightings of the text-to-image SR model into our video generation framework. Additionally, we incorporate a temporal adapter to ensure temporal coherence across video frames. We investigate different tuning approaches based on our inflated architecture and report trade-offs between computational costs and super-resolution quality. Empirical evaluation, both quantitative and qualitative, on the Shutterstock video dataset, demonstrates that our approach is able to perform text-to-video SR generation with good visual quality and temporal consistency. To evaluate temporal coherence, we also present visualizations in video format in https://drive.google.com/drive/folders/1YVc-KMSJqOrEUdQWVaI-Yfu8Vsfu_1aO?usp=sharing . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10404v1-abstract-full').style.display = 'none'; document.getElementById('2401.10404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
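<p class="is-size-7">The weight "inflation" mentioned above, reusing a pretrained image (2D) model inside a video (3D) architecture, is commonly done by copying each 2D convolution kernel into one temporal slice of a 3D kernel. The sketch below shows that generic recipe, which leaves per-frame outputs unchanged before any video fine-tuning; it is not the paper's exact architecture or temporal adapter.</p>
<pre><code>
import torch
import torch.nn as nn

def inflate_conv2d(conv2d, temporal_kernel=3):
    """Build a Conv3d whose weights reproduce a pretrained Conv2d on each frame.
    The 2D kernel is placed in the centre temporal slice; other slices are zero."""
    conv3d = nn.Conv3d(
        conv2d.in_channels, conv2d.out_channels,
        kernel_size=(temporal_kernel, *conv2d.kernel_size),
        stride=(1, *conv2d.stride),
        padding=(temporal_kernel // 2, *conv2d.padding),
        bias=conv2d.bias is not None,
    )
    with torch.no_grad():
        conv3d.weight.zero_()
        conv3d.weight[:, :, temporal_kernel // 2] = conv2d.weight  # centre slice
        if conv2d.bias is not None:
            conv3d.bias.copy_(conv2d.bias)
    return conv3d

# Usage: identical per-frame output before any video fine-tuning.
c2 = nn.Conv2d(3, 8, kernel_size=3, padding=1)
c3 = inflate_conv2d(c2)
frames = torch.randn(1, 3, 5, 32, 32)          # (batch, channels, time, H, W)
per_frame = torch.stack([c2(frames[:, :, t]) for t in range(5)], dim=2)
print(torch.allclose(c3(frames), per_frame, atol=1e-5))  # True
</code></pre>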
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV'24 workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15291">arXiv:2312.15291</a> <span> [<a href="https://arxiv.org/pdf/2312.15291">pdf</a>, <a href="https://arxiv.org/format/2312.15291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reverse Multi-Choice Dialogue Commonsense Inference with Graph-of-Thought </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Li Zheng</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bobo Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+L">Lizi Liao</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Donghong Ji</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+C">Chong Teng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15291v2-abstract-short" style="display: inline;"> With the proliferation of dialogic data across the Internet, the Dialogue Commonsense Multi-choice Question Answering (DC-MCQ) task has emerged as a response to the challenge of comprehending user queries and intentions. Although prevailing methodologies exhibit effectiveness in addressing single-choice questions, they encounter difficulties in handling multi-choice queries due to the heightened i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15291v2-abstract-full').style.display = 'inline'; document.getElementById('2312.15291v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15291v2-abstract-full" style="display: none;"> With the proliferation of dialogic data across the Internet, the Dialogue Commonsense Multi-choice Question Answering (DC-MCQ) task has emerged as a response to the challenge of comprehending user queries and intentions. Although prevailing methodologies exhibit effectiveness in addressing single-choice questions, they encounter difficulties in handling multi-choice queries due to the heightened intricacy and informational density. In this paper, inspired by the human cognitive process of progressively excluding options, we propose a three-step Reverse Exclusion Graph-of-Thought (ReX-GoT) framework, including Option Exclusion, Error Analysis, and Combine Information. Specifically, our ReX-GoT mimics human reasoning by gradually excluding irrelevant options and learning the reasons for option errors to choose the optimal path of the GoT and ultimately infer the correct answer. By progressively integrating intricate clues, our method effectively reduces the difficulty of multi-choice reasoning and provides a novel solution for DC-MCQ. Extensive experiments on the CICERO and CICERO$_{v2}$ datasets validate the significant improvement of our approach on DC-MCQ task. 
In the zero-shot setting, our model outperforms the best baseline by 17.67% in terms of F1 score for the multi-choice task. Most strikingly, our GPT3.5-based ReX-GoT framework achieves a remarkable 39.44% increase in F1 score. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15291v2-abstract-full').style.display = 'none'; document.getElementById('2312.15291v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by the 38th Annual AAAI Conference on Artificial Intelligence (AAAI'24, FEBRUARY 20-27, 2024, VANCOUVER, CANADA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11974">arXiv:2312.11974</a> <span> [<a href="https://arxiv.org/pdf/2312.11974">pdf</a>, <a href="https://arxiv.org/format/2312.11974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Ms-senet: Enhancing Speech Emotion Recognition Through Multi-scale Feature Fusion With Squeeze-and-excitation Blocks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mengbo Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yuanzhong Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dichucheng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yulun Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaoxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Haojun Fei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11974v2-abstract-short" style="display: inline;"> Speech Emotion Recognition (SER) has become a growing focus of research in human-computer interaction. Spatiotemporal features play a crucial role in SER, yet current research lacks comprehensive spatiotemporal feature learning. This paper focuses on addressing this gap by proposing a novel approach. In this paper, we employ Convolutional Neural Network (CNN) with varying kernel sizes for spatial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11974v2-abstract-full').style.display = 'inline'; document.getElementById('2312.11974v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11974v2-abstract-full" style="display: none;"> Speech Emotion Recognition (SER) has become a growing focus of research in human-computer interaction.
Spatiotemporal features play a crucial role in SER, yet current research lacks comprehensive spatiotemporal feature learning. This paper focuses on addressing this gap by proposing a novel approach. In this paper, we employ Convolutional Neural Network (CNN) with varying kernel sizes for spatial and temporal feature extraction. Additionally, we introduce Squeeze-and-Excitation (SE) modules to capture and fuse multi-scale features, facilitating effective information fusion for improved emotion recognition and a deeper understanding of the temporal evolution of speech emotion. Moreover, we employ skip connections and Spatial Dropout (SD) layers to prevent overfitting and increase the model's depth. Our method outperforms the previous state-of-the-art method, achieving an average UAR and WAR improvement of 1.62% and 1.32%, respectively, across six benchmark SER datasets. Further experiments demonstrated that our method can fully extract spatiotemporal features in low-resource conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11974v2-abstract-full').style.display = 'none'; document.getElementById('2312.11974v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18651">arXiv:2311.18651</a> <span> [<a href="https://arxiv.org/pdf/2311.18651">pdf</a>, <a href="https://arxiv.org/format/2311.18651">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LL3DA: Visual Interactive Instruction Tuning for Omni-3D Understanding, Reasoning, and Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sijin Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingsheng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+G">Gang Yu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hongyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiayuan Fan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18651v1-abstract-short" style="display: inline;"> Recent advances in Large Multimodal Models (LMM) have made it possible for various applications in human-machine interactions. However, developing LMMs that can comprehend, reason, and plan in complex and diverse 3D environments remains a challenging topic, especially considering the demand for understanding permutation-invariant point cloud 3D representations of the 3D scene. 
Existing works seek… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18651v1-abstract-full').style.display = 'inline'; document.getElementById('2311.18651v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18651v1-abstract-full" style="display: none;"> Recent advances in Large Multimodal Models (LMM) have made it possible for various applications in human-machine interactions. However, developing LMMs that can comprehend, reason, and plan in complex and diverse 3D environments remains a challenging topic, especially considering the demand for understanding permutation-invariant point cloud 3D representations of the 3D scene. Existing works seek help from multi-view images, and project 2D features to 3D space as 3D scene representations. This, however, leads to huge computational overhead and performance degradation. In this paper, we present LL3DA, a Large Language 3D Assistant that takes point clouds as direct input and responds to both textual instructions and visual prompts. This helps LMMs better comprehend human interactions and further helps to remove the ambiguities in cluttered 3D scenes. Experiments show that LL3DA achieves remarkable results and surpasses various 3D vision-language models on both 3D Dense Captioning and 3D Question Answering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18651v1-abstract-full').style.display = 'none'; document.getElementById('2311.18651v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://ll3da.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.12890">arXiv:2311.12890</a> <span> [<a href="https://arxiv.org/pdf/2311.12890">pdf</a>, <a href="https://arxiv.org/format/2311.12890">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> De-fine: Decomposing and Refining Visual Programs with Auto-Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+M">Minghe Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+L">Liang Pang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoming Wang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Z">Zheqi Lv</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.12890v3-abstract-short" style="display: inline;"> Visual programming, a modular and generalizable paradigm, integrates different modules and Python operators to solve various vision-language tasks. Unlike end-to-end models that need task-specific data, it advances in performing visual processing and reasoning in an unsupervised manner. Current visual programming methods generate programs in a single pass for each task where the ability to evaluat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12890v3-abstract-full').style.display = 'inline'; document.getElementById('2311.12890v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.12890v3-abstract-full" style="display: none;"> Visual programming, a modular and generalizable paradigm, integrates different modules and Python operators to solve various vision-language tasks. Unlike end-to-end models that need task-specific data, it advances in performing visual processing and reasoning in an unsupervised manner. Current visual programming methods generate programs in a single pass for each task where the ability to evaluate and optimize based on feedback, unfortunately, is lacking, which consequentially limits their effectiveness for complex, multi-step problems. Drawing inspiration from benders decomposition, we introduce De-fine, a training-free framework that automatically decomposes complex tasks into simpler subtasks and refines programs through auto-feedback. This model-agnostic approach can improve logical reasoning performance by integrating the strengths of multiple models. Our experiments across various visual tasks show that De-fine creates more robust programs. 
Moreover, viewing each feedback module as an independent agent will yield fresh prospects for the field of agent research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12890v3-abstract-full').style.display = 'none'; document.getElementById('2311.12890v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12798">arXiv:2310.12798</a> <span> [<a href="https://arxiv.org/pdf/2310.12798">pdf</a>, <a href="https://arxiv.org/format/2310.12798">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sihang Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yanchen Luo</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yixin Cao</a>, <a href="/search/cs?searchtype=author&query=Kawaguchi%2C+K">Kenji Kawaguchi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12798v4-abstract-short" style="display: inline;"> Language Models (LMs) have demonstrated impressive molecule understanding ability on various 1D text-related tasks. However, they inherently lack 2D graph perception - a critical ability of human professionals in comprehending molecules' topological structures. To bridge this gap, we propose MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter. MolCA enables an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12798v4-abstract-full').style.display = 'inline'; document.getElementById('2310.12798v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12798v4-abstract-full" style="display: none;"> Language Models (LMs) have demonstrated impressive molecule understanding ability on various 1D text-related tasks. However, they inherently lack 2D graph perception - a critical ability of human professionals in comprehending molecules' topological structures. To bridge this gap, we propose MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter. MolCA enables an LM (e.g., Galactica) to understand both text- and graph-based molecular contents via the cross-modal projector. 
Specifically, the cross-modal projector is implemented as a Q-Former to connect a graph encoder's representation space and an LM's text space. Further, MolCA employs a uni-modal adapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks. Unlike previous studies that couple an LM with a graph encoder via cross-modal contrastive learning, MolCA retains the LM's ability of open-ended text generation and augments it with 2D graph information. To showcase its effectiveness, we extensively benchmark MolCA on tasks of molecule captioning, IUPAC name prediction, and molecule-text retrieval, on which MolCA significantly outperforms the baselines. Our codes and checkpoints can be found at https://github.com/acharkq/MolCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12798v4-abstract-full').style.display = 'none'; document.getElementById('2310.12798v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP main conference. 9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.17205">arXiv:2309.17205</a> <span> [<a href="https://arxiv.org/pdf/2309.17205">pdf</a>, <a href="https://arxiv.org/format/2309.17205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Complex-query Referring Image Segmentation: A Novel Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangyan Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xun Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&query=Zimmermann%2C+R">Roger Zimmermann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.17205v1-abstract-short" style="display: inline;"> Referring Image Understanding (RIS) has been extensively studied over the past decade, leading to the development of advanced algorithms. However, there has been a lack of research investigating how existing algorithms should be benchmarked with complex language queries, which include more informative descriptions of surrounding objects and backgrounds (\eg \textit{"the black car."} vs. 
arXiv:2309.17205 (https://arxiv.org/abs/2309.17205) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition)
Towards Complex-query Referring Image Segmentation: A Novel Benchmark
Authors: Wei Ji, Li Li, Hao Fei, Xiangyan Liu, Xun Yang, Juncheng Li, Roger Zimmermann
Abstract: Referring Image Segmentation (RIS) has been extensively studied over the past decade, leading to the development of advanced algorithms. However, there has been a lack of research investigating how existing algorithms should be benchmarked with complex language queries, which include more informative descriptions of surrounding objects and backgrounds (e.g., "the black car." vs. "the black car is parked on the road and beside the bus."). Given the significant improvement in the semantic understanding capability of large pre-trained models, it is crucial to take a step further in RIS by incorporating complex language that resembles real-world applications. To close this gap, building upon the existing RefCOCO and Visual Genome datasets, we propose RIS-CQ, a new RIS benchmark with complex queries. The RIS-CQ dataset is of high quality and large scale; it challenges existing RIS methods with enriched, specific, and informative queries, and enables a more realistic scenario for RIS research. Besides, we present a niche-targeting method to better tackle RIS-CQ, called the dual-modality graph alignment model (DuMoGa), which outperforms a series of RIS methods.
Submitted 29 September, 2023; originally announced September 2023.
arXiv:2309.11368 (https://arxiv.org/abs/2309.11368) [pdf, other] - cs.RO (Robotics); cs.AI (Artificial Intelligence)
Dynamic Hand Gesture-Featured Human Motor Adaptation in Tool Delivery using Voice Recognition
Authors: Haolin Fei, Stefano Tedeschi, Yanpei Huang, Andrew Kennedy, Ziwei Wang
Abstract: Human-robot collaboration has provided users with higher efficiency in interactive tasks. Nevertheless, most collaborative schemes rely on complicated human-machine interfaces, which might lack the requisite intuitiveness compared with natural limb control. We also expect to understand human intent with low training data requirements. In response to these challenges, this paper introduces an innovative human-robot collaborative framework that seamlessly integrates hand gesture and dynamic movement recognition, voice recognition, and a switchable control adaptation strategy. These modules provide a user-friendly approach that enables the robot to deliver tools as the user needs them, especially when the user is working with both hands. Users can therefore focus on task execution without additional training in the use of human-machine interfaces, while the robot interprets their intuitive gestures. The proposed multimodal interaction framework is executed on a UR5e robot platform equipped with a RealSense D435i camera, and its effectiveness is assessed through a circuit-board soldering task. The experimental results demonstrate superior performance in hand gesture recognition: the static hand gesture recognition module achieves an accuracy of 94.3%, while the dynamic motion recognition module reaches 97.6% accuracy. Compared with solo human manipulation, the proposed approach facilitates more efficient tool delivery without significantly distracting from human intents.
Submitted 20 September, 2023; originally announced September 2023.
Comments: This work has been submitted to the IEEE for possible publication
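Purely as a sketch of how such a multimodal interface could combine its cues (this is not the paper's pipeline; the recognizer outputs and the "give me ..." command format are invented for illustration), a minimal fusion rule might look like:

```python
# Illustrative only: stubs stand in for the camera- and microphone-based
# recognizers described in the abstract; voice names the tool, gestures confirm.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Intent:
    tool: Optional[str]   # which tool to deliver, if any was named
    confirm: bool         # whether the gestures confirm the hand-over

def fuse(static_gesture: str, dynamic_motion: str, voice_cmd: str) -> Intent:
    """Fuse one frame of gesture/motion labels with the latest voice command."""
    tool = voice_cmd[len("give me "):].strip() if voice_cmd.startswith("give me ") else None
    confirm = static_gesture == "open_palm" or dynamic_motion == "beckon"
    return Intent(tool=tool, confirm=confirm)

print(fuse("open_palm", "still", "give me the soldering iron"))
# Intent(tool='the soldering iron', confirm=True)
```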
arXiv:2309.05519 (https://arxiv.org/abs/2309.05519) [pdf, other] - cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning)
NExT-GPT: Any-to-Any Multimodal LLM
Authors: Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, Tat-Seng Chua
Abstract: While Multimodal Large Language Models (MM-LLMs) have recently made exciting strides, they mostly fall prey to the limitation of input-side-only multimodal understanding, without the ability to produce content in multiple modalities. As we humans always perceive the world and communicate with people through various modalities, developing any-to-any MM-LLMs capable of accepting and delivering content in any modality becomes essential to human-level AI. To fill the gap, we present an end-to-end, general-purpose any-to-any MM-LLM system, NExT-GPT. We connect an LLM with multimodal adaptors and different diffusion decoders, enabling NExT-GPT to perceive inputs and generate outputs in arbitrary combinations of text, images, videos, and audio. By leveraging existing well-trained, high-performing encoders and decoders, NExT-GPT is tuned with only a small number of parameters (1%) in certain projection layers, which not only enables low-cost training but also facilitates convenient expansion to more potential modalities. Moreover, we introduce modality-switching instruction tuning (MosIT) and manually curate a high-quality dataset for MosIT, based on which NExT-GPT is empowered with complex cross-modal semantic understanding and content generation. Overall, our research showcases the promising possibility of building an AI agent capable of modeling universal modalities, paving the way for more human-like AI research in the community. Project page: https://next-gpt.github.io/
Submitted 25 June, 2024; v1 submitted 11 September, 2023; originally announced September 2023.
Comments: ICML 2024 (Oral)
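A minimal sketch of the parameter-efficiency idea described above, under the assumption that only small projection layers bridging frozen encoders/decoders to a frozen backbone are trained; the module sizes below are toy values, not NExT-GPT's.

```python
# Sketch under assumptions (toy sizes, not the released NExT-GPT code): freeze
# a backbone "LLM" and train only small projection layers that bridge frozen
# modality encoders/decoders to it, then report the trainable-parameter share.
import torch.nn as nn

llm = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True), num_layers=6)
for p in llm.parameters():
    p.requires_grad = False                  # backbone stays frozen

image_in_proj = nn.Linear(768, 512)          # image-encoder features -> LLM space
audio_in_proj = nn.Linear(128, 512)          # audio-encoder features -> LLM space
image_out_proj = nn.Linear(512, 768)         # LLM space -> condition for a diffusion decoder

modules = [llm, image_in_proj, audio_in_proj, image_out_proj]
trainable = sum(p.numel() for m in modules for p in m.parameters() if p.requires_grad)
total = sum(p.numel() for m in modules for p in m.parameters())
# With these toy sizes the share is a few percent; the paper reports ~1% at full LLM scale.
print(f"trainable parameters: {trainable / total:.2%}")
```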
arXiv:2308.13812 (https://arxiv.org/abs/2308.13812) [pdf, other] - cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Dysen-VDM: Empowering Dynamics-aware Text-to-Video Diffusion with LLMs
Authors: Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Tat-Seng Chua
Abstract: Text-to-video (T2V) synthesis has gained increasing attention in the community, where the recently emerged diffusion models (DMs) have promisingly shown stronger performance than earlier approaches. While existing state-of-the-art DMs are competent to achieve high-resolution video generation, they may largely suffer from key limitations (e.g., action occurrence disorders, crude video motions) with respect to intricate temporal dynamics modeling, one of the cruxes of video synthesis. In this work, we investigate strengthening the awareness of video dynamics in DMs for high-quality T2V generation. Inspired by human intuition, we design an innovative dynamic scene manager (dubbed Dysen) module, which includes (step 1) extracting from the input text the key actions with proper time-order arrangement, (step 2) transforming the action schedules into dynamic scene graph (DSG) representations, and (step 3) enriching the scenes in the DSG with sufficient and reasonable details. Taking advantage of existing powerful LLMs (e.g., ChatGPT) via in-context learning, Dysen realizes (nearly) human-level temporal dynamics understanding. Finally, the resulting video DSG with rich action scene details is encoded as fine-grained spatio-temporal features and integrated into the backbone T2V DM for video generation. Experiments on popular T2V datasets suggest that our Dysen-VDM consistently outperforms prior art by significant margins, especially in scenarios with complex actions. Codes at https://haofei.vip/Dysen-VDM
Submitted 19 March, 2024; v1 submitted 26 August, 2023; originally announced August 2023.
Comments: CVPR 2024
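To make the three-step pipeline concrete, here is a hedged sketch of steps 1-2 (action extraction via an LLM prompt, then a per-step scene-graph arrangement). The `call_llm` function is a hypothetical placeholder standing in for an in-context-learning call; the prompt and JSON schema are assumptions made for this example.

```python
# Illustrative sketch of steps 1-2 described above; `call_llm` is a hypothetical
# placeholder for an in-context-learning call to an LLM such as ChatGPT.
import json

def call_llm(prompt: str) -> str:
    # Placeholder: a real system would query an LLM here.
    return json.dumps([
        {"order": 1, "agent": "a dog", "action": "runs", "target": "across the lawn"},
        {"order": 2, "agent": "a dog", "action": "catches", "target": "a frisbee"},
    ])

def text_to_dsg(caption: str) -> list:
    prompt = ("Extract the key actions in temporal order from the caption below "
              "as JSON [{order, agent, action, target}].\nCaption: " + caption)
    actions = sorted(json.loads(call_llm(prompt)), key=lambda a: a["order"])
    # Step 2: arrange one (agent, action, target) triple per time step.
    return [{"step": a["order"], "triples": [(a["agent"], a["action"], a["target"])]}
            for a in actions]

print(text_to_dsg("A dog runs across the lawn and catches a frisbee."))
```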
arXiv:2308.10025 (https://arxiv.org/abs/2308.10025) [pdf, other] - cs.CL (Computation and Language)
I3: Intent-Introspective Retrieval Conditioned on Instructions
Authors: Kaihang Pan, Juncheng Li, Wenjie Wang, Hao Fei, Hongye Song, Wei Ji, Jun Lin, Xiaozhong Liu, Tat-Seng Chua, Siliang Tang
Abstract: Recent studies indicate that dense retrieval models struggle to perform well on a wide variety of retrieval tasks that lack dedicated training data, as different retrieval tasks often entail distinct search intents. To address this challenge, in this work we leverage instructions to flexibly describe retrieval intents and introduce I3, a unified retrieval system that performs Intent-Introspective retrieval across various tasks, conditioned on instructions, without any task-specific training. I3 innovatively incorporates a pluggable introspector in a parameter-isolated manner to comprehend specific retrieval intents by jointly reasoning over the input query and instruction, and seamlessly integrates the introspected intent into the original retrieval model for intent-aware retrieval. Furthermore, we propose progressively-pruned intent learning, which utilizes extensive LLM-generated data to train I3 phase by phase and embodies two key designs: progressive structure pruning and drawback extrapolation-based data refinement. Extensive experiments on the BEIR benchmark show that I3 significantly outperforms baseline methods designed with task-specific retrievers, achieving state-of-the-art zero-shot performance without any task-specific tuning.
Submitted 25 April, 2024; v1 submitted 19 August, 2023; originally announced August 2023.
Comments: Accepted by SIGIR 2024
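A minimal sketch (not the I3 implementation) of what "intent-introspective" dense retrieval can mean mechanically: a small introspector module reads the query and instruction embeddings, produces an intent vector, and the intent-adjusted query is scored against documents by dot product. The toy hashing tokenizer and module sizes are assumptions.

```python
# Minimal sketch (not the I3 implementation): an introspector combines query and
# instruction embeddings into an intent vector that adjusts the query before
# ordinary dot-product dense retrieval. Tokenizer and sizes are toy assumptions.
import torch
import torch.nn as nn

dim = 64
encode = nn.EmbeddingBag(5000, dim)                               # stand-in text encoder
introspector = nn.Sequential(nn.Linear(2 * dim, dim), nn.Tanh())  # pluggable intent module

def ids(text):                                   # toy hashing "tokenizer"
    return torch.tensor([[hash(w) % 5000 for w in text.lower().split()]])

def score(query, instruction, docs):
    q, ins = encode(ids(query)), encode(ids(instruction))
    intent = introspector(torch.cat([q, ins], dim=-1))
    q_aware = q + intent                         # intent-aware query representation
    d = torch.cat([encode(ids(doc)) for doc in docs])
    return (q_aware @ d.T).squeeze(0)            # one relevance score per document

print(score("transformers", "retrieve biology papers, not ML papers",
            ["attention neural network", "gene expression in cells"]))
```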
arXiv:2308.05095 (https://arxiv.org/abs/2308.05095) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
LayoutLLM-T2I: Eliciting Layout Guidance from LLM for Text-to-Image Generation
Authors: Leigang Qu, Shengqiong Wu, Hao Fei, Liqiang Nie, Tat-Seng Chua
Abstract: In the text-to-image generation field, recent remarkable progress in Stable Diffusion makes it possible to generate a rich variety of novel photorealistic images. However, current models still face misalignment issues (e.g., problematic spatial-relation understanding and numeration failures) in complex natural scenes, which impedes high-faithfulness text-to-image generation. Although recent efforts have been made to improve controllability by providing fine-grained guidance (e.g., sketches and scribbles), this issue has not been fundamentally tackled, since users have to provide such guidance information manually. In this work, we strive to synthesize high-fidelity images that are semantically aligned with a given textual prompt without any manual guidance. Toward this end, we propose a coarse-to-fine paradigm to achieve layout planning and image generation. Concretely, we first generate the coarse-grained layout conditioned on a given textual prompt via in-context learning based on Large Language Models. Afterward, we propose a fine-grained object-interaction diffusion method to synthesize high-faithfulness images conditioned on the prompt and the automatically generated layout. Extensive experiments demonstrate that our proposed method outperforms state-of-the-art models in terms of layout and image generation. Our code and settings are available at https://layoutllm-t2i.github.io.
Submitted 12 August, 2023; v1 submitted 9 August, 2023; originally announced August 2023.
Comments: Accepted by ACM MM 2023
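The coarse-to-fine split can be pictured with the following sketch, in which the LLM call proposing labeled, normalized boxes and the layout-conditioned image generator are both placeholders rather than the paper's models.

```python
# Sketch of the coarse-to-fine paradigm described above. Both stages are
# placeholders: `llm_propose_layout` stands in for an in-context LLM call, and
# the returned payload for what a layout-conditioned diffusion model would take.
import json

def llm_propose_layout(prompt: str) -> list:
    # Placeholder: a real system would prompt an LLM for object labels and boxes.
    return [{"label": "dog",     "box": [0.05, 0.55, 0.45, 0.95]},   # [x0, y0, x1, y1], normalized
            {"label": "frisbee", "box": [0.55, 0.20, 0.75, 0.35]}]

def generate(prompt: str) -> dict:
    layout = llm_propose_layout(prompt)
    # Fine-grained stage: a layout-conditioned diffusion model would render the
    # image here; we only show the conditioning payload it would receive.
    return {"prompt": prompt, "layout": layout}

print(json.dumps(generate("a dog jumping to catch a frisbee"), indent=2))
```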
arXiv:2308.05081 (https://arxiv.org/abs/2308.05081) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
Constructing Holistic Spatio-Temporal Scene Graph for Video Semantic Role Labeling
Authors: Yu Zhao, Hao Fei, Yixin Cao, Bobo Li, Meishan Zhang, Jianguo Wei, Min Zhang, Tat-Seng Chua
Abstract: Video Semantic Role Labeling (VidSRL) aims to detect the salient events in given videos by recognizing the predicate-argument event structures and the interrelationships between events. While recent endeavors have put forth methods for VidSRL, they are mostly subject to two key drawbacks: the lack of fine-grained spatial scene perception and the insufficient modeling of video temporality. Toward this end, this work explores a novel holistic spatio-temporal scene graph (namely HostSG) representation, based on existing dynamic scene graph structures, which effectively models both the fine-grained spatial semantics and the temporal dynamics of videos for VidSRL. Built upon the HostSG, we present a niche-targeting VidSRL framework. A scene-event mapping mechanism is first designed to bridge the gap between the underlying scene structure and the high-level event semantic structure, resulting in an overall hierarchical scene-event graph structure (termed ICE). We further perform iterative structure refinement to optimize the ICE graph, such that the overall structure representation can best coincide with the end-task demands. Finally, the three subtask predictions of VidSRL are jointly decoded, where the end-to-end paradigm effectively avoids error propagation. On the benchmark dataset, our framework significantly outperforms the current best-performing model. Further analyses provide a better understanding of the advantages of our method.
Submitted 12 August, 2023; v1 submitted 9 August, 2023; originally announced August 2023.
Comments: Accepted by ACM MM 2023
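As a rough illustration of what a holistic spatio-temporal scene graph could store (my reading of the abstract, not the paper's data structure): per-frame spatial triples, temporal edges that track objects across frames, and event nodes mapped onto groups of scene nodes.

```python
# Illustrative data-structure sketch only; field names and layout are assumptions.
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class FrameGraph:
    frame_id: int
    objects: List[str]
    spatial: List[Tuple[str, str, str]]           # (subject, relation, object)

@dataclass
class HostSG:
    frames: List[FrameGraph]
    temporal: List[Tuple[int, int, str]] = field(default_factory=list)  # (frame_a, frame_b, tracked object)
    events: List[dict] = field(default_factory=list)                    # {predicate, args, frames}

sg = HostSG(
    frames=[FrameGraph(0, ["man", "ball"], [("man", "holds", "ball")]),
            FrameGraph(1, ["man", "ball"], [("man", "throws", "ball")])],
    temporal=[(0, 1, "man"), (0, 1, "ball")],
    events=[{"predicate": "throw", "args": {"Arg0": "man", "Arg1": "ball"}, "frames": [0, 1]}],
)
print(len(sg.frames), sg.events[0]["predicate"])
```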
arXiv:2308.04502 (https://arxiv.org/abs/2308.04502) [pdf, other] - cs.CL (Computation and Language)
Revisiting Disentanglement and Fusion on Modality and Context in Conversational Multimodal Emotion Recognition
Authors: Bobo Li, Hao Fei, Lizi Liao, Yu Zhao, Chong Teng, Tat-Seng Chua, Donghong Ji, Fei Li
Abstract: Enabling machines to understand human emotions in multimodal contexts under dialogue scenarios, the task of multimodal emotion recognition in conversation (MM-ERC), has been a hot research topic. MM-ERC has received consistent attention in recent years, and a diverse range of methods has been proposed to secure better task performance. Most existing works treat MM-ERC as a standard multimodal classification problem and perform multimodal feature disentanglement and fusion to maximize feature utility. Yet, after revisiting the characteristics of MM-ERC, we argue that both feature multimodality and conversational contextualization should be properly modeled simultaneously during the feature disentanglement and fusion steps. In this work, we aim to push the task performance further by taking full account of these insights. On the one hand, during feature disentanglement, based on contrastive learning, we devise a Dual-level Disentanglement Mechanism (DDM) to decouple the features into both the modality space and the utterance space. On the other hand, during the feature fusion stage, we propose a Contribution-aware Fusion Mechanism (CFM) and a Context Refusion Mechanism (CRM) for multimodal and context integration, respectively. Together they schedule the proper integration of multimodal and context features: CFM explicitly manages the multimodal feature contributions dynamically, while CRM flexibly coordinates the introduction of dialogue contexts. On two public MM-ERC datasets, our system consistently achieves new state-of-the-art performance. Further analyses demonstrate that all of the proposed mechanisms greatly facilitate MM-ERC by making full use of the multimodal and context features adaptively. Note that the proposed methods have great potential to facilitate a broader range of other conversational multimodal tasks.
Submitted 12 August, 2023; v1 submitted 8 August, 2023; originally announced August 2023.
Comments: Accepted by ACM MM 2023
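For intuition, a contribution-aware fusion step of the kind described above can be sketched as a learned softmax gate over the per-utterance text/audio/vision features; this is an illustrative stand-in, not the paper's CFM, and the dimensions are invented.

```python
# Minimal sketch (illustrative, not the paper's CFM): learn per-utterance
# weights over the modality features and return their weighted sum.
import torch
import torch.nn as nn

class ContributionAwareFusion(nn.Module):
    def __init__(self, dim=128, num_modalities=3):
        super().__init__()
        self.gate = nn.Linear(dim * num_modalities, num_modalities)

    def forward(self, feats):                    # feats: list of (B, dim) tensors
        stacked = torch.stack(feats, dim=1)      # (B, M, dim)
        w = torch.softmax(self.gate(torch.cat(feats, dim=-1)), dim=-1)  # (B, M) contributions
        return (w.unsqueeze(-1) * stacked).sum(dim=1)                   # (B, dim) fused feature

fusion = ContributionAwareFusion()
text, audio, vision = (torch.randn(4, 128) for _ in range(3))
print(fusion([text, audio, vision]).shape)       # torch.Size([4, 128])
```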
arXiv:2308.04498 (https://arxiv.org/abs/2308.04498) [pdf, other] - cs.CL (Computation and Language)
DialogRE^C+: An Extension of DialogRE to Investigate How Much Coreference Helps Relation Extraction in Dialogs
Authors: Yiyun Xiong, Mengwei Dai, Fei Li, Hao Fei, Bobo Li, Shengqiong Wu, Donghong Ji, Chong Teng
Abstract: Dialogue relation extraction (DRE), which identifies the relations between argument pairs in dialogue text, suffers greatly from the frequent occurrence of personal pronouns and entity/speaker coreference. This work introduces DialogRE^C+, a new benchmark dataset that brings coreference resolution into the DRE scenario. With the aid of high-quality coreference knowledge, the reasoning over argument relations is expected to be enhanced. In the DialogRE^C+ dataset, we manually annotate a total of 5,068 coreference chains over 36,369 argument mentions based on the existing DialogRE data, where four coreference chain types, namely speaker, person, location, and organization chains, are explicitly marked. We further develop four coreference-enhanced graph-based DRE models, which learn effective coreference representations for improving the DRE task. We also train a coreference resolution model based on our annotations and evaluate the effect of automatically extracted coreference chains, demonstrating the practicality of our dataset and its potential for other domains and tasks.
Submitted 12 August, 2023; v1 submitted 8 August, 2023; originally announced August 2023.
Comments: Accepted by NLPCC 2023
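A tiny sketch of how the annotated chains could be represented in code (the record layout is an assumption, not the released data format):

```python
# Purely illustrative representation of a coreference chain: one of the four
# chain types plus a list of argument mentions given as (dialog turn, text).
from dataclasses import dataclass
from typing import List, Tuple

CHAIN_TYPES = {"speaker", "person", "location", "organization"}

@dataclass
class CorefChain:
    chain_type: str                      # one of CHAIN_TYPES
    mentions: List[Tuple[int, str]]      # (dialog turn index, mention text)

chain = CorefChain("speaker", [(0, "Speaker 1"), (2, "I"), (5, "you")])
assert chain.chain_type in CHAIN_TYPES
print(len(chain.mentions), "mentions in a", chain.chain_type, "chain")
```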
arXiv:2308.04424 (https://arxiv.org/abs/2308.04424) [pdf, other] - cs.CL (Computation and Language)
A Bi-directional Multi-hop Inference Model for Joint Dialog Sentiment Classification and Act Recognition
Authors: Li Zheng, Fei Li, Yuyang Chai, Chong Teng, Donghong Ji
Abstract: The joint task of Dialog Sentiment Classification (DSC) and Act Recognition (DAR) aims to predict the sentiment label and act label for each utterance in a dialog simultaneously. However, current methods encode the dialog context in only one direction, which limits their ability to thoroughly comprehend the context. Moreover, these methods overlook the explicit correlations between sentiment and act labels, which leads to an insufficient ability to capture rich sentiment and act clues and hinders effective and accurate reasoning. To address these issues, we propose a Bi-directional Multi-hop Inference Model (BMIM) that leverages a feature selection network and a bi-directional multi-hop inference network to iteratively extract and integrate rich sentiment and act clues in a bi-directional manner. We also employ contrastive learning and dual learning to explicitly model the correlations between sentiment and act labels. Our experiments on two widely used datasets show that BMIM outperforms state-of-the-art baselines by at least 2.6% F1 in DAR and 1.4% F1 in DSC. Additionally, our proposed model not only improves performance but also enhances the interpretability of the joint sentiment and act prediction task.
Submitted 12 August, 2023; v1 submitted 8 August, 2023; originally announced August 2023.
Comments: Accepted by NLPCC 2023
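The bi-directional multi-hop idea can be sketched as repeated left-to-right and right-to-left passes that refine the utterance states; the recurrence below is an illustrative stand-in for BMIM's inference network, with invented sizes and hop count.

```python
# Illustrative sketch: each "hop" reads the dialog once in both directions with
# a bidirectional GRU and folds the result back into the utterance states.
# Not the authors' model; dimensions and hop count are assumptions.
import torch
import torch.nn as nn

class BiMultiHop(nn.Module):
    def __init__(self, dim=64, hops=3):
        super().__init__()
        self.hops = hops
        self.gru = nn.GRU(dim, dim, batch_first=True, bidirectional=True)
        self.merge = nn.Linear(2 * dim, dim)

    def forward(self, utt):                       # utt: (B, T, dim) utterance features
        h = utt
        for _ in range(self.hops):                # each hop refines the states
            out, _ = self.gru(h)                  # (B, T, 2*dim): both directions
            h = h + torch.tanh(self.merge(out))   # residual clue integration
        return h

print(BiMultiHop()(torch.randn(2, 5, 64)).shape)  # torch.Size([2, 5, 64])
```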
arXiv:2308.01846 (https://arxiv.org/abs/2308.01846) [pdf, other] - cs.CL (Computation and Language)
XNLP: An Interactive Demonstration System for Universal Structured NLP
Authors: Hao Fei, Meishan Zhang, Min Zhang, Tat-Seng Chua
Abstract: Structured Natural Language Processing (XNLP) is an important subset of NLP that entails understanding the underlying semantic or syntactic structure of texts, which serves as a foundational component for many downstream applications. Despite certain recent efforts to explore universal solutions for specific categories of XNLP tasks, a comprehensive and effective approach for unifying all XNLP tasks has long remained underdeveloped. Meanwhile, although XNLP demonstration systems are vital for researchers exploring various XNLP tasks, existing platforms are limited, e.g., supporting only a few XNLP tasks and lacking interactivity and universality. To this end, we propose an advanced XNLP demonstration platform in which we leverage an LLM to achieve universal XNLP, with one highly generalizable model for all tasks. Overall, our system advances in multiple aspects, including universal XNLP modeling, high performance, interpretability, scalability, and interactivity, providing a unified platform for exploring diverse XNLP tasks in the community. XNLP is online: https://xnlp.haofei.vip
Submitted 21 June, 2024; v1 submitted 3 August, 2023; originally announced August 2023.
Comments: ACL 2024 Demonstration Paper