Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 453 results for author: <span class="mathjax">Chang, K</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Chang%2C+K">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chang, K"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chang%2C+K&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chang, K"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chang%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chang%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chang%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chang%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chang%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chang%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07820">arXiv:2411.07820</a> <span> [<a href="https://arxiv.org/pdf/2411.07820">pdf</a>, <a href="https://arxiv.org/format/2411.07820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Query Optimization for Parametric Knowledge Refinement in Retrieval-Augmented Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cong%2C+Y">Youan Cong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Akash%2C+P+S">Pritom Saha Akash</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K+C">Kevin Chen-Chuan Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07820v2-abstract-short" style="display: inline;"> We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel approach designed to bridge the pre-retrieval information gap in Retrieval-Augmented Generation (RAG) systems through query optimization tailored to meet the specific knowledge requirements of Large Language Models (LLMs). 
Unlike conventional query optimization techniques used in RAG, the ERRR framework begins by extracting pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07820v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07820v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07820v2-abstract-full" style="display: none;"> We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel approach designed to bridge the pre-retrieval information gap in Retrieval-Augmented Generation (RAG) systems through query optimization tailored to meet the specific knowledge requirements of Large Language Models (LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR framework begins by extracting parametric knowledge from LLMs, followed by using a specialized query optimizer for refining these queries. This process ensures the retrieval of only the most pertinent information essential for generating accurate responses. Moreover, to enhance flexibility and reduce computational costs, we propose a trainable scheme for our pipeline that utilizes a smaller, tunable model as the query optimizer, which is refined through knowledge distillation from a larger teacher model. Our evaluations on various question-answering (QA) datasets and with different retrieval systems show that ERRR consistently outperforms existing baselines, proving to be a versatile and cost-effective module for improving the utility and accuracy of RAG systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07820v2-abstract-full').style.display = 'none'; document.getElementById('2411.07820v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span> [<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A+T">Andy T. 
Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/cs?searchtype=author&query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. 
While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04335">arXiv:2411.04335</a> <span> [<a href="https://arxiv.org/pdf/2411.04335">pdf</a>, <a href="https://arxiv.org/format/2411.04335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GazeGen: Gaze-Driven User Interaction for Visual Content Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hsieh%2C+H">He-Yen Hsieh</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziyun Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S+Q">Sai Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Ting%2C+W+M">Wei-Te Mark Ting</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kao-Den Chang</a>, <a href="/search/cs?searchtype=author&query=De+Salvo%2C+B">Barbara De Salvo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chiao Liu</a>, <a href="/search/cs?searchtype=author&query=Kung%2C+H+T">H. T. Kung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04335v2-abstract-short" style="display: inline;"> We present GazeGen, a user interaction system that generates visual content (images and videos) for locations indicated by the user's eye gaze. GazeGen allows intuitive manipulation of visual content by targeting regions of interest with gaze. Using advanced techniques in object detection and generative AI, GazeGen performs gaze-controlled image adding/deleting, repositioning, and surface style ch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04335v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04335v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04335v2-abstract-full" style="display: none;"> We present GazeGen, a user interaction system that generates visual content (images and videos) for locations indicated by the user's eye gaze. GazeGen allows intuitive manipulation of visual content by targeting regions of interest with gaze. 
Using advanced techniques in object detection and generative AI, GazeGen performs gaze-controlled image adding/deleting, repositioning, and surface style changes of image objects, and converts static images into videos. Central to GazeGen is the DFT Gaze (Distilled and Fine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters, performing accurate real-time gaze predictions tailored to individual users' eyes on small edge devices. GazeGen is the first system to combine visual content generation with real-time gaze estimation, made possible exclusively by DFT Gaze. This real-time gaze estimation enables various visual content generation tasks, all controlled by the user's gaze. The input for DFT Gaze is the user's eye images, while the inputs for visual content generation are the user's view and the predicted gaze point from DFT Gaze. To achieve efficient gaze predictions, we derive the small model from a large model (10x larger) via novel knowledge distillation and personal adaptation techniques. We integrate knowledge distillation with a masked autoencoder, developing a compact yet powerful gaze estimation model. This model is further fine-tuned with Adapters, enabling highly accurate and personalized gaze predictions with minimal user input. DFT Gaze ensures low-latency and precise gaze tracking, supporting a wide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA and OpenEDS2020 benchmarks, demonstrating low angular gaze error and low latency on the edge device (Raspberry Pi 4). Furthermore, we describe applications of GazeGen, illustrating its versatility and effectiveness in various usage scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04335v2-abstract-full').style.display = 'none'; document.getElementById('2411.04335v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23277">arXiv:2410.23277</a> <span> [<a href="https://arxiv.org/pdf/2410.23277">pdf</a>, <a href="https://arxiv.org/format/2410.23277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yining Hong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Beide Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Maxine Wu</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yuanhao Zhai</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chung-Ching Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yingnian Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23277v2-abstract-short" style="display: inline;"> Human beings are endowed with a complementary learning system, which bridges the slow learning of general world dynamics with fast storage of episodic memory from a new experience. Previous video generation models, however, primarily focus on slow learning by pre-training on vast amounts of data, overlooking the fast learning phase crucial for episodic memory storage. This oversight leads to incon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23277v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23277v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23277v2-abstract-full" style="display: none;"> Human beings are endowed with a complementary learning system, which bridges the slow learning of general world dynamics with fast storage of episodic memory from a new experience. Previous video generation models, however, primarily focus on slow learning by pre-training on vast amounts of data, overlooking the fast learning phase crucial for episodic memory storage. 
This oversight leads to inconsistencies across temporally distant frames when generating longer videos, as these frames fall beyond the model's context window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning system for action-driven long video generation. Our approach incorporates a masked conditional video diffusion model for the slow learning of world dynamics, alongside an inference-time fast learning strategy based on a temporal LoRA module. Specifically, the fast learning process updates its temporal LoRA parameters based on local inputs and outputs, thereby efficiently storing episodic memory in its parameters. We further propose a slow-fast learning loop algorithm that seamlessly integrates the inner fast learning loop into the outer slow learning loop, enabling the recall of prior multi-episode experiences for context-aware skill learning. To facilitate the slow learning of an approximate world model, we collect a large-scale dataset of 200k videos with language action annotations, covering a wide range of scenarios. Extensive experiments show that SlowFast-VGen outperforms baselines across various metrics for action-driven video generation, achieving an FVD score of 514 compared to 782, and maintaining consistency in longer videos, with an average of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm significantly enhances performances on long-horizon planning tasks as well. Project Website: https://slowfast-vgen.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23277v2-abstract-full').style.display = 'none'; document.getElementById('2410.23277v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22086">arXiv:2410.22086</a> <span> [<a href="https://arxiv.org/pdf/2410.22086">pdf</a>, <a href="https://arxiv.org/format/2410.22086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unlearning as multi-task optimization: A normalized gradient difference approach with an adaptive learning rate </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bu%2C+Z">Zhiqi Bu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaomeng Jin</a>, <a href="/search/cs?searchtype=author&query=Vinzamuri%2C+B">Bhanukiran Vinzamuri</a>, <a href="/search/cs?searchtype=author&query=Ramakrishna%2C+A">Anil Ramakrishna</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Cevher%2C+V">Volkan Cevher</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+M">Mingyi Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22086v2-abstract-short" style="display: inline;"> Machine unlearning has been used to remove unwanted knowledge acquired by large language models (LLMs). In this paper, we examine machine unlearning from an optimization perspective, framing it as a regularized multi-task optimization problem, where one task optimizes a forgetting objective and another optimizes the model performance. In particular, we introduce a normalized gradient difference (N… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22086v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22086v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22086v2-abstract-full" style="display: none;"> Machine unlearning has been used to remove unwanted knowledge acquired by large language models (LLMs). In this paper, we examine machine unlearning from an optimization perspective, framing it as a regularized multi-task optimization problem, where one task optimizes a forgetting objective and another optimizes the model performance. In particular, we introduce a normalized gradient difference (NGDiff) algorithm, enabling us to have better control over the trade-off between the objectives, while integrating a new, automatic learning rate scheduler. We provide a theoretical analysis and empirically demonstrate the superior performance of NGDiff among state-of-the-art unlearning methods on the TOFU and MUSE datasets while exhibiting stable training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22086v2-abstract-full').style.display = 'none'; document.getElementById('2410.22086v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20021">arXiv:2410.20021</a> <span> [<a href="https://arxiv.org/pdf/2410.20021">pdf</a>, <a href="https://arxiv.org/format/2410.20021">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Think Carefully and Check Again! Meta-Generation Unlocking LLMs for Low-Resource Cross-Lingual Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhecheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yujun Cai</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+N">Naifan Cheung</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20021v1-abstract-short" style="display: inline;"> Cross-lingual summarization (CLS) aims to generate a summary for the source text in a different target language. Currently, instruction-tuned large language models (LLMs) excel at various English tasks. However, unlike languages such as English, Chinese or Spanish, for those relatively low-resource languages with limited usage or data, recent studies have shown that LLMs' performance on CLS tasks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20021v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20021v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20021v1-abstract-full" style="display: none;"> Cross-lingual summarization (CLS) aims to generate a summary for the source text in a different target language. Currently, instruction-tuned large language models (LLMs) excel at various English tasks. However, unlike languages such as English, Chinese or Spanish, for those relatively low-resource languages with limited usage or data, recent studies have shown that LLMs' performance on CLS tasks remains unsatisfactory even with few-shot settings. This raises the question: Are LLMs capable of handling cross-lingual summarization tasks for low-resource languages? 
To resolve this question, we fully explore the potential of large language models on cross-lingual summarization task for low-resource languages through our four-step zero-shot method: Summarization, Improvement, Translation and Refinement (SITR) with correspondingly designed prompts. We test our proposed method with multiple LLMs on two well-known cross-lingual summarization datasets with various low-resource target languages. The results show that: i) GPT-3.5 and GPT-4 significantly and consistently outperform other baselines when using our zero-shot SITR methods. ii) By employing our proposed method, we unlock the potential of LLMs, enabling them to effectively handle cross-lingual summarization tasks for relatively low-resource languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20021v1-abstract-full').style.display = 'none'; document.getElementById('2410.20021v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20016">arXiv:2410.20016</a> <span> [<a href="https://arxiv.org/pdf/2410.20016">pdf</a>, <a href="https://arxiv.org/format/2410.20016">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Vulnerability of LLMs to Vertically Aligned Text Manipulations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhecheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yujun Cai</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zhen Xiong</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20016v1-abstract-short" style="display: inline;"> Text classification involves categorizing a given text, such as determining its sentiment or identifying harmful content. With the advancement of large language models (LLMs), these models have become highly effective at performing text classification tasks. However, they still show vulnerabilities to variations in text formatting. Recent research demonstrates that modifying input formats, such as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20016v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20016v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20016v1-abstract-full" style="display: none;"> Text classification involves categorizing a given text, such as determining its sentiment or identifying harmful content. 

7. arXiv:2410.20016 [pdf, other] cs.CL
   Vulnerability of LLMs to Vertically Aligned Text Manipulations
   Authors: Zhecheng Li, Yiwei Wang, Bryan Hooi, Yujun Cai, Zhen Xiong, Nanyun Peng, Kai-wei Chang
   Abstract: Text classification involves categorizing a given text, such as determining its sentiment or identifying harmful content. With the advancement of large language models (LLMs), these models have become highly effective at performing text classification tasks. However, they still show vulnerabilities to variations in text formatting. Recent research demonstrates that modifying input formats, such as vertically aligning words for encoder-based models, can substantially lower accuracy in text classification tasks. While easily understood by humans, these inputs can significantly mislead models, posing a potential risk of bypassing detection in real-world scenarios involving harmful or sensitive information. With the expanding application of LLMs, a crucial question arises: Do decoder-based LLMs exhibit similar vulnerabilities to vertically formatted text input? In this paper, we investigate the impact of vertical text input on the performance of various LLMs across multiple text classification datasets and analyze the underlying causes. Our findings are as follows: (i) Vertical text input significantly degrades the accuracy of LLMs in text classification tasks. (ii) Chain of Thought (CoT) reasoning does not help LLMs recognize vertical input or mitigate its vulnerability, but few-shot learning with careful analysis does. (iii) We explore the underlying cause of the vulnerability by analyzing the inherent issues in tokenization and attention matrices.
   Submitted 25 October, 2024; originally announced October 2024.

8. arXiv:2410.18393 [pdf, other] cs.CL, cs.SI
   SPEED++: A Multilingual Event Extraction Framework for Epidemic Prediction and Preparedness
   Authors: Tanmay Parekh, Jeffrey Kwan, Jiarui Yu, Sparsh Johri, Hyosang Ahn, Sreya Muppalla, Kai-Wei Chang, Wei Wang, Nanyun Peng
   Abstract: Social media is often the first place where communities discuss the latest societal trends. Prior works have utilized this platform to extract epidemic-related information (e.g. infections, preventive measures) to provide early warnings for epidemic prediction. However, these works only focused on English posts, while epidemics can occur anywhere in the world, and early discussions are often in the local, non-English languages. In this work, we introduce the first multilingual Event Extraction (EE) framework SPEED++ for extracting epidemic event information for a wide range of diseases and languages. To this end, we extend a previous epidemic ontology with 20 argument roles; and curate our multilingual EE dataset SPEED++ comprising 5.1K tweets in four languages for four diseases. Annotating data in every language is infeasible; thus we develop zero-shot cross-lingual cross-disease models (i.e., training only on English COVID data) utilizing multilingual pre-training and show their efficacy in extracting epidemic-related events for 65 diverse languages across different diseases. Experiments demonstrate that our framework can provide epidemic warnings for COVID-19 in its earliest stages in Dec 2019 (3 weeks before global discussions) from Chinese Weibo posts without any training in Chinese. Furthermore, we exploit our framework's argument extraction capabilities to aggregate community epidemic discussions like symptoms and cure measures, aiding misinformation detection and public attention monitoring. Overall, we lay a strong foundation for multilingual epidemic preparedness.
   Submitted 23 October, 2024; originally announced October 2024.
   Comments: Accepted at EMNLP 2024

9. arXiv:2410.15511 [pdf, other] cs.IR
   ConTReGen: Context-driven Tree-structured Retrieval for Open-domain Long-form Text Generation
   Authors: Kashob Kumar Roy, Pritom Saha Akash, Kevin Chen-Chuan Chang, Lucian Popa
   Abstract: Open-domain long-form text generation requires generating coherent, comprehensive responses that address complex queries with both breadth and depth. This task is challenging due to the need to accurately capture diverse facets of input queries. Existing iterative retrieval-augmented generation (RAG) approaches often struggle to delve deeply into each facet of complex queries and integrate knowledge from various sources effectively. This paper introduces ConTReGen, a novel framework that employs a context-driven, tree-structured retrieval approach to enhance the depth and relevance of retrieved content. ConTReGen integrates a hierarchical, top-down in-depth exploration of query facets with a systematic bottom-up synthesis, ensuring comprehensive coverage and coherent integration of multifaceted information. Extensive experiments on multiple datasets, including LFQA and ODSUM, alongside a newly introduced dataset, ODSUM-WikiHow, demonstrate that ConTReGen outperforms existing state-of-the-art RAG models.
   Submitted 20 October, 2024; originally announced October 2024.
   Comments: Accepted at EMNLP'24 Findings
To enable learning compression for multi-hop reasoning, we curate synthetic data by extracting atomic proposition expressions that encapsulate distinct factoids from the source documents to compose synthetic summaries. Based on our synthetic data built entirely by open-source models, BRIEF generates more concise summaries and enables a range of LLMs to achieve exceptional open-domain question answering (QA) performance. For example, on HotpotQA, BRIEF improves the compression rate by 2 times compared to the state-of-the-art baseline, while outperforming it by 3.00% EM and 4.16% F1 with Flan-UL2 as the reader LM. It also generates more concise summaries than proprietary GPT-3.5, while demonstrating nearly identical QA performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15277v1-abstract-full').style.display = 'none'; document.getElementById('2410.15277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://jasonforjoy.github.io/BRIEF/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14978">arXiv:2410.14978</a> <span> [<a href="https://arxiv.org/pdf/2410.14978">pdf</a>, <a href="https://arxiv.org/format/2410.14978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Subversive Characters and Stereotyping Readers: Characterizing Queer Relationalities with Dialogue-Based Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+K+K">Kent K. Chang</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+A">Anna Ho</a>, <a href="/search/cs?searchtype=author&query=Bamman%2C+D">David Bamman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14978v2-abstract-short" style="display: inline;"> Television is often seen as a site for subcultural identification and subversive fantasy, including in queer cultures. How might we measure subversion, or the degree to which the depiction of social relationship between a dyad (e.g. two characters who are colleagues) deviates from its typical representation on TV? To explore this question, we introduce the task of stereotypic relationship extracti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14978v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14978v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14978v2-abstract-full" style="display: none;"> Television is often seen as a site for subcultural identification and subversive fantasy, including in queer cultures. How might we measure subversion, or the degree to which the depiction of social relationship between a dyad (e.g. 
arXiv:2410.14978 [pdf, other] cs.CL
Subversive Characters and Stereotyping Readers: Characterizing Queer Relationalities with Dialogue-Based Relation Extraction
Authors: Kent K. Chang, Anna Ho, David Bamman
Abstract: Television is often seen as a site for subcultural identification and subversive fantasy, including in queer cultures. How might we measure subversion, or the degree to which the depiction of a social relationship between a dyad (e.g. two characters who are colleagues) deviates from its typical representation on TV? To explore this question, we introduce the task of stereotypic relationship extraction. Built on cognitive stylistics, linguistic anthropology, and dialogue relation extraction, in this paper, we attempt to model the cognitive process of stereotyping TV characters in dialogic interactions. Given a dyad, we want to predict: what social relationship do the speakers exhibit through their words? Subversion is then characterized by the discrepancy between the distribution of the model's predictions and the ground truth labels. To demonstrate the usefulness of this task and gesture at a methodological intervention, we enclose four case studies to characterize the representation of queer relationalities in The Big Bang Theory, Frasier, and Gilmore Girls, as we explore the suspicious and reparative modes of reading with our computational methods.
Submitted 21 October, 2024; v1 submitted 19 October, 2024; originally announced October 2024.
Comments: CHR 2024: Computational Humanities Research Conference

arXiv:2410.13111 [pdf, ps, other] cs.LG, cs.CL, stat.ML
Controllable Generation via Locally Constrained Resampling
Authors: Kareem Ahmed, Kai-Wei Chang, Guy Van den Broeck
Abstract: Autoregressive models have demonstrated an unprecedented ability to model the intricacies of natural language. However, they continue to struggle with generating complex outputs that adhere to logical constraints. Sampling from a fully-independent distribution subject to a constraint is hard. Sampling from an autoregressive distribution subject to a constraint is doubly hard: we have to contend not only with the hardness of the constraint but also the distribution's lack of structure. We propose a tractable probabilistic approach that performs Bayesian conditioning to draw samples subject to a constraint. Our approach considers the entire sequence, leading to a more globally optimal constrained generation than current greedy methods. Starting from a model sample, we induce a local, factorized distribution which we can tractably condition on the constraint. To generate samples that satisfy the constraint, we sample from the conditional distribution, correct for biases in the samples, and resample. The resulting samples closely approximate the target distribution and are guaranteed to satisfy the constraints. We evaluate our approach on several tasks, including LLM detoxification and solving Sudoku puzzles. We show that by disallowing a list of toxic expressions our approach is able to steer the model's outputs away from toxic generations, outperforming similar approaches to detoxification. We conclude by showing that our approach achieves perfect accuracy on Sudoku compared to <50% for GPT-4o and Gemini 1.5.
Submitted 16 October, 2024; originally announced October 2024.
Comments: arXiv admin note: text overlap with arXiv:2312.03905
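Illustrative sketch: the recipe outlined in the abstract above (pilot sample, locally factorized proposal, tractable conditioning on the constraint, importance correction, resampling) can be mimicked on a toy problem. The code below illustrates that general recipe on a 4-bit space only; it is not the paper's algorithm, and p_model, the "exactly two ones" constraint, and the flip-based marginals are all stand-ins chosen so everything is enumerable.

import itertools
import random

SEQ_LEN = 4
SPACE = list(itertools.product([0, 1], repeat=SEQ_LEN))

def p_model(x):
    # Toy non-factorized "model": rewards agreement between adjacent bits (unnormalized).
    return 2.0 ** sum(a == b for a, b in zip(x, x[1:]))

def satisfies(x):
    # Toy logical constraint standing in for e.g. a lexical or Sudoku constraint.
    return sum(x) == 2

def local_proposal(x0):
    # Induce an independent per-position distribution around the pilot sample x0.
    marginals = []
    for i in range(SEQ_LEN):
        flipped = list(x0)
        flipped[i] = 1 - flipped[i]
        keep, flip = p_model(x0), p_model(tuple(flipped))
        marginals.append({x0[i]: keep / (keep + flip), 1 - x0[i]: flip / (keep + flip)})
    return marginals

def q_prob(marginals, x):
    prob = 1.0
    for m, xi in zip(marginals, x):
        prob *= m[xi]
    return prob

random.seed(0)
# 1) Pilot sample from the model (enumeration stands in for autoregressive sampling).
x0 = random.choices(SPACE, weights=[p_model(x) for x in SPACE])[0]
# 2) Condition the factorized proposal on the constraint (tractable here by enumeration).
q = local_proposal(x0)
valid = [x for x in SPACE if satisfies(x)]
cond_weights = [q_prob(q, x) for x in valid]
# 3) Sample candidates, importance-correct against the model, and resample.
candidates = random.choices(valid, weights=cond_weights, k=32)
imp_weights = [p_model(x) / q_prob(q, x) for x in candidates]
final = random.choices(candidates, weights=imp_weights)[0]
print("constrained sample:", final, "satisfies:", satisfies(final))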
arXiv:2410.12029 [pdf, other] cs.CL, cs.CY
On Classification with Large Language Models in Cultural Analytics
Authors: David Bamman, Kent K. Chang, Li Lucy, Naitian Zhou
Abstract: In this work, we survey the way in which classification is used as a sensemaking practice in cultural analytics, and assess where large language models can fit into this landscape. We identify ten tasks supported by publicly available datasets on which we empirically assess the performance of LLMs compared to traditional supervised methods, and explore the ways in which LLMs can be employed for sensemaking goals beyond mere accuracy. We find that prompt-based LLMs are competitive with traditional supervised models for established tasks, but perform less well on de novo tasks. In addition, LLMs can assist sensemaking by acting as an intermediary input to formal theory testing.
Submitted 15 October, 2024; originally announced October 2024.
Journal ref: CHR 2024: Computational Humanities Research Conference

arXiv:2410.10813 [pdf, other] cs.CL
LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory
Authors: Di Wu, Hongwei Wang, Wenhao Yu, Yuwei Zhang, Kai-Wei Chang, Dong Yu
Abstract: Recent large language model (LLM)-driven chat assistant systems have integrated memory components to track user-assistant chat histories, enabling more accurate and personalized responses. However, their long-term memory capabilities in sustained interactions remain underexplored. This paper introduces LongMemEval, a comprehensive benchmark designed to evaluate five core long-term memory abilities of chat assistants: information extraction, multi-session reasoning, temporal reasoning, knowledge updates, and abstention. With 500 meticulously curated questions embedded within freely scalable user-assistant chat histories, LongMemEval presents a significant challenge to existing long-term memory systems, with commercial chat assistants and long-context LLMs showing a 30% accuracy drop on memorizing information across sustained interactions. We then present a unified framework that breaks down the long-term memory design into four design choices across the indexing, retrieval, and reading stages. Built upon key experimental insights, we propose several memory designs including session decomposition for optimizing value granularity, fact-augmented key expansion for enhancing the index structure, and time-aware query expansion for refining the search scope. Experimental results show that these optimizations greatly improve both memory recall and downstream question answering on LongMemEval. Overall, our study provides valuable resources and guidance for advancing the long-term memory capabilities of LLM-based chat assistants, paving the way toward more personalized and reliable conversational AI.
Submitted 14 October, 2024; originally announced October 2024.
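Illustrative sketch: the design choices named near the end of the abstract above (finer-grained values per session round, keys expanded with extracted facts, time-aware query expansion) can be sketched with an ordinary in-memory index. This is a schematic illustration under those assumptions, not the paper's system; the keyword-overlap scorer and the extract_facts helper are simplistic placeholders.

from dataclasses import dataclass, field

@dataclass
class MemoryEntry:
    session_id: str
    day: int                       # toy timestamp
    text: str                      # a single user-assistant round (fine-grained value)
    keys: set = field(default_factory=set)

def extract_facts(text):
    # Placeholder fact extractor: here, just the longer words of the round.
    return {w.lower().strip(".,?") for w in text.split() if len(w) > 4}

class Memory:
    def __init__(self):
        self.entries = []

    def add_round(self, session_id, day, text):
        # Session decomposition: one entry per round; fact-augmented key expansion.
        self.entries.append(MemoryEntry(session_id, day, text, extract_facts(text)))

    def search(self, query, after_day=None, top_k=2):
        # Time-aware query expansion: optionally restrict the search window.
        q_keys = extract_facts(query)
        pool = [e for e in self.entries if after_day is None or e.day >= after_day]
        ranked = sorted(pool, key=lambda e: len(e.keys & q_keys), reverse=True)
        return [e.text for e in ranked[:top_k]]

mem = Memory()
mem.add_round("s1", day=1, text="User mentioned moving to Seattle for a new position.")
mem.add_round("s2", day=5, text="User said their favorite restaurant closed last week.")
print(mem.search("Where did the user say they were moving?", after_day=0))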
arXiv:2410.09326 [pdf, other] quant-ph, cs.PF, cs.SE
QOPS: A Compiler Framework for Quantum Circuit Simulation Acceleration with Profile Guided Optimizations
Authors: Yu-Tsung Wu, Po-Hsuan Huang, Kai-Chieh Chang, Chia-Heng Tu, Shih-Hao Hung
Abstract: Quantum circuit simulation is important in the evolution of quantum software and hardware. Novel algorithms can be developed and evaluated by performing quantum circuit simulations on classical computers before physical quantum computers are available. Unfortunately, compared with a physical quantum computer, a prolonged simulation time hampers the rapid development of quantum algorithms. Inspired by the feedback-directed optimization scheme used by classical compilers to improve the generated code, this work proposes QOPS, a quantum compiler framework that enables profile-guided optimization (PGO) for quantum circuit simulation acceleration. The QOPS compiler instruments a quantum simulator to collect performance data during the circuit simulation and then generates the optimized version of the quantum circuit based on the collected data. Experimental results show that the PGO can effectively shorten the simulation time on our tested benchmark programs. In particular, the simulator-specific PGO (virtual swap) can be applied to the benchmarks to accelerate the simulation speed by a factor of 1.19. As for the hardware-independent PGO, compared with the brute force mechanism (turning on all available compilation flags), which achieves a 21% performance improvement against the non-optimized version, the PGO achieves a 16% speedup with a factor of 63 less compilation time than the brute force approach.
Submitted 20 October, 2024; v1 submitted 11 October, 2024; originally announced October 2024.
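Illustrative sketch: the feedback loop described in the abstract above (instrument a simulation run, collect performance data, emit an optimized circuit) is the classic profile-guided optimization pattern. The toy Python below illustrates only that pattern; the gate list, the per-gate cost model, and the fusion rule are invented for illustration and are unrelated to the actual QOPS implementation.

from collections import Counter

# A toy circuit: (gate, qubit) pairs; two-qubit gates are omitted for brevity.
circuit = [("H", 0), ("RZ", 0), ("RZ", 0), ("H", 1), ("RZ", 1), ("X", 2), ("RZ", 0)]
GATE_COST = {"H": 3, "X": 1, "RZ": 2, "FUSED": 2}    # invented per-gate simulation costs

def simulate(circ, profile=None):
    total = 0
    for gate, qubit in circ:
        total += GATE_COST[gate]
        if profile is not None:                       # instrumented (profiling) run
            profile[(gate, qubit)] += 1
    return total

def optimize(circ, profile, hot_threshold=2):
    # Fuse consecutive gates on qubits that the profile marks as "hot".
    hot_qubits = {q for (_, q), n in profile.items() if n >= hot_threshold}
    out = []
    for gate, qubit in circ:
        if out and qubit in hot_qubits and out[-1][1] == qubit:
            out[-1] = ("FUSED", qubit)                # stand-in for real gate fusion
        else:
            out.append((gate, qubit))
    return out

profile = Counter()
baseline = simulate(circuit, profile)                 # 1) profiling pass
optimized = optimize(circuit, profile)                # 2) profile-guided rewrite
print("cost before:", baseline, "cost after:", simulate(optimized))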
arXiv:2410.08182 [pdf, other] cs.CV, cs.AI, cs.CL
MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models
Authors: Wenbo Hu, Jia-Chen Gu, Zi-Yi Dou, Mohsen Fayyaz, Pan Lu, Kai-Wei Chang, Nanyun Peng
Abstract: Existing multimodal retrieval benchmarks primarily focus on evaluating whether models can retrieve and utilize external textual knowledge for question answering. However, there are scenarios where retrieving visual information is either more beneficial or easier to access than textual data. In this paper, we introduce a multimodal retrieval-augmented generation benchmark, MRAG-Bench, in which we systematically identify and categorize scenarios where visually augmented knowledge is better than textual knowledge, for instance, more images from varying viewpoints. MRAG-Bench consists of 16,130 images and 1,353 human-annotated multiple-choice questions across 9 distinct scenarios. With MRAG-Bench, we conduct an evaluation of 10 open-source and 4 proprietary large vision-language models (LVLMs). Our results show that all LVLMs exhibit greater improvements when augmented with images compared to textual knowledge, confirming that MRAG-Bench is vision-centric. Additionally, we conduct extensive analysis with MRAG-Bench, which offers valuable insights into retrieval-augmented LVLMs. Notably, the top-performing model, GPT-4o, faces challenges in effectively leveraging retrieved knowledge, achieving only a 5.82% improvement with ground-truth information, in contrast to a 33.16% improvement observed in human participants. These findings highlight the importance of MRAG-Bench in encouraging the community to enhance LVLMs' ability to utilize retrieved visual knowledge more effectively.
Submitted 10 October, 2024; originally announced October 2024.
Comments: https://mragbench.github.io

arXiv:2410.05559 [pdf, other] cs.CL
Attribute Controlled Fine-tuning for Large Language Models: A Case Study on Detoxification
Authors: Tao Meng, Ninareh Mehrabi, Palash Goyal, Anil Ramakrishna, Aram Galstyan, Richard Zemel, Kai-Wei Chang, Rahul Gupta, Charith Peris
Abstract: We propose a constraint learning schema for fine-tuning Large Language Models (LLMs) with attribute control. Given a training corpus and control criteria formulated as a sequence-level constraint on model outputs, our method fine-tunes the LLM on the training corpus while enhancing constraint satisfaction with minimal impact on its utility and generation quality. Specifically, our approach regularizes the LLM training by penalizing the KL divergence between the desired output distribution, which satisfies the constraints, and the LLM's posterior. This regularization term can be approximated by an auxiliary model trained to decompose the sequence-level constraints into token-level guidance, allowing the term to be measured by a closed-form formulation. To further improve efficiency, we design a parallel scheme for concurrently updating both the LLM and the auxiliary model. We evaluate the empirical performance of our approach by controlling the toxicity when training an LLM. We show that our approach leads to an LLM that produces fewer inappropriate responses while achieving competitive performance on benchmarks and a toxicity detection task.
Submitted 7 October, 2024; originally announced October 2024.
Comments: Accepted to EMNLP Findings
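Illustrative sketch: the loss structure described in the abstract above, a standard fine-tuning term plus a KL penalty between a desired output distribution and the model's token distribution, can be written down directly. This is a schematic PyTorch sketch under the assumption that an auxiliary model has already produced token-level target probabilities (guidance_probs); it is not the authors' training code, and the weight beta is invented.

import torch
import torch.nn.functional as F

def attribute_controlled_loss(lm_logits, labels, guidance_probs, beta=0.1):
    """lm_logits: (B, T, V) model logits; labels: (B, T) next-token ids;
    guidance_probs: (B, T, V) token-level desired distribution from an auxiliary model."""
    nll = F.cross_entropy(lm_logits.transpose(1, 2), labels)   # usual fine-tuning term
    log_p = F.log_softmax(lm_logits, dim=-1)
    # KL(q_desired || p_model), measured token-wise in closed form.
    kl = F.kl_div(log_p, guidance_probs, reduction="batchmean")
    return nll + beta * kl

# Toy shapes only; in practice these come from the LLM and the auxiliary model.
B, T, V = 2, 5, 11
logits = torch.randn(B, T, V)
labels = torch.randint(V, (B, T))
guidance = torch.softmax(torch.randn(B, T, V), dim=-1)
print(attribute_controlled_loss(logits, labels, guidance))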
arXiv:2410.05269 [pdf, other] cs.CL, cs.AI, cs.LG
Data Advisor: Dynamic Data Curation for Safety Alignment of Large Language Models
Authors: Fei Wang, Ninareh Mehrabi, Palash Goyal, Rahul Gupta, Kai-Wei Chang, Aram Galstyan
Abstract: Data is a crucial element in large language model (LLM) alignment. Recent studies have explored using LLMs for efficient data collection. However, LLM-generated data often suffers from quality issues, with underrepresented or absent aspects and low-quality datapoints. To address these problems, we propose Data Advisor, an enhanced LLM-based method for generating data that takes into account the characteristics of the desired dataset. Starting from a set of pre-defined principles, Data Advisor monitors the status of the generated data, identifies weaknesses in the current dataset, and advises the next iteration of data generation accordingly. Data Advisor can be easily integrated into existing data generation methods to enhance data quality and coverage. Experiments on safety alignment of three representative LLMs (i.e., Mistral, Llama2, and Falcon) demonstrate the effectiveness of Data Advisor in enhancing model safety against various fine-grained safety issues without sacrificing model utility.
Submitted 7 October, 2024; originally announced October 2024.
Comments: Accepted to EMNLP 2024 Main Conference. Project website: https://feiwang96.github.io/DataAdvisor/
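Illustrative sketch: the control loop described in the abstract above (monitor the generated data, identify a weakness, advise the next generation round) is easy to express in plain Python. The sketch below is generic and runs with stub functions; generate_batch, the safety aspects, and the coverage-based notion of "weakness" are placeholders, not the paper's prompts or criteria.

from collections import Counter
import random

SAFETY_ASPECTS = ["self-harm", "privacy", "violence", "fraud"]   # illustrative categories

def generate_batch(advice, n=4):
    # Stub generator: a real system would prompt an LLM, conditioning on the advice.
    focus = advice or random.choice(SAFETY_ASPECTS)
    return [{"aspect": focus, "prompt": f"example #{i} targeting {focus}"} for i in range(n)]

def summarize(dataset):
    return Counter(item["aspect"] for item in dataset)

def identify_weakness(summary):
    # Weakness = the least covered aspect so far (a simplistic stand-in).
    return min(SAFETY_ASPECTS, key=lambda a: summary.get(a, 0))

dataset, advice = [], None
for round_id in range(3):
    dataset += generate_batch(advice)
    status = summarize(dataset)              # monitor the data generated so far
    advice = identify_weakness(status)       # advise the next iteration
    print(f"round {round_id}: {dict(status)} -> next focus: {advice}")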
arXiv:2410.04628 [pdf, other] cs.CL
Control Large Language Models via Divide and Conquer
Authors: Bingxuan Li, Yiwei Wang, Tao Meng, Kai-Wei Chang, Nanyun Peng
Abstract: This paper investigates controllable generation for large language models (LLMs) with prompt-based control, focusing on Lexically Constrained Generation (LCG). We systematically evaluate the performance of LLMs on satisfying lexical constraints with prompt-based control, as well as their efficacy in downstream applications. We conclude that LLMs face significant challenges in consistently satisfying lexical constraints with prompt-based control. We identify three key limitations of LLMs for LCG: (1) position bias, where LLMs tend to satisfy constraints that appear in specific positions within the input; (2) low responsiveness to decoding parameters, which have minimal impact on the control of LLMs; and (3) difficulty handling the inherent complexity of certain constraints (e.g., compound words). To address these issues, we introduce a Divide and Conquer Generation strategy, effective for both white-box and black-box LLMs, to enhance LLM performance in LCG tasks, which demonstrates over 90% improvement in success rate on the most challenging LCG task. Our analysis provides valuable insights into the performance of LLMs in LCG with prompt-based control, and our proposed strategy offers a pathway to more sophisticated and customized text generation applications.
Submitted 6 October, 2024; originally announced October 2024.
Comments: EMNLP 2024
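Illustrative sketch: a divide-and-conquer strategy for lexically constrained generation can be sketched independently of any particular model: split the keyword list into small groups, generate a sentence per group, then merge and verify. The Python below shows only that control flow with a stub generator; it is not the paper's method or prompts.

def stub_generate(keywords):
    # Stand-in for an LLM call asked to use every keyword in one sentence.
    return "A sentence mentioning " + ", ".join(keywords) + "."

def missing_keywords(text, keywords):
    return [k for k in keywords if k.lower() not in text.lower()]

def divide_and_conquer_generate(keywords, group_size=3, max_retries=2):
    # Divide: small groups are much easier to satisfy than one long constraint list.
    groups = [keywords[i:i + group_size] for i in range(0, len(keywords), group_size)]
    text = " ".join(stub_generate(g) for g in groups)
    # Conquer/repair: re-generate for any keywords still missing, then merge.
    for _ in range(max_retries):
        missing = missing_keywords(text, keywords)
        if not missing:
            break
        text += " " + stub_generate(missing)
    return text

print(divide_and_conquer_generate(["quokka", "telescope", "harvest", "violin", "tsunami"]))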
arXiv:2410.03071 [pdf, other] cs.CL, cs.IR
Enhancing Short-Text Topic Modeling with LLM-Driven Context Expansion and Prefix-Tuned VAEs
Authors: Pritom Saha Akash, Kevin Chen-Chuan Chang
Abstract: Topic modeling is a powerful technique for uncovering hidden themes within a collection of documents. However, the effectiveness of traditional topic models often relies on sufficient word co-occurrence, which is lacking in short texts. Therefore, existing approaches, whether probabilistic or neural, frequently struggle to extract meaningful patterns from such data, resulting in incoherent topics. To address this challenge, we propose a novel approach that leverages large language models (LLMs) to extend short texts into more detailed sequences before applying topic modeling. To further improve efficiency and address the semantic inconsistency of LLM-generated texts, we propose to use prefix tuning to train a smaller language model coupled with a variational autoencoder for short-text topic modeling. Our method significantly improves short-text topic modeling performance, as demonstrated by extensive experiments on real-world datasets with extreme data sparsity, outperforming current state-of-the-art topic models.
Submitted 19 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: EMNLP Findings 2024. arXiv admin note: substantial text overlap with arXiv:2310.15420
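Illustrative sketch: the two-stage idea in the abstract above, expand each short text into a longer pseudo-document and only then run a topic model, can be illustrated with off-the-shelf tools. In the sketch below a trivial rule-based expander stands in for the LLM and scikit-learn's LDA stands in for the paper's prefix-tuned VAE topic model; both substitutions are for illustration only.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

short_texts = ["gpu shortage pricing", "transfer window striker",
               "quantum error rates", "midfield pressing tactics"]

def expand(text):
    # Stand-in for an LLM-driven context expansion of the short text.
    related = {"gpu": "hardware chips compute", "striker": "football goals club",
               "quantum": "qubits decoherence physics", "midfield": "football formation coach"}
    extra = " ".join(v for k, v in related.items() if k in text)
    return text + " " + extra

expanded = [expand(t) for t in short_texts]
vectorizer = CountVectorizer().fit(expanded)
dtm = vectorizer.transform(expanded)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(dtm)
terms = vectorizer.get_feature_names_out()
for topic_id, weights in enumerate(lda.components_):
    top = [terms[i] for i in weights.argsort()[::-1][:4]]
    print(f"topic {topic_id}:", top)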
arXiv:2410.00120 [pdf, other] cs.RO
Learning to Swim: Reinforcement Learning for 6-DOF Control of Thruster-driven Autonomous Underwater Vehicles
Authors: Levi Cai, Kevin Chang, Yogesh Girdhar
Abstract: Controlling AUVs can be challenging because of the effect of complex non-linear hydrodynamic forces acting on the robot, which, unlike ground robots, are significant in water and cannot be ignored. The problem is especially challenging for small AUVs for which the dynamics can change significantly with payload changes and deployments under different water conditions. The common approach to AUV control is a combination of passive stabilization with added buoyancy on top and weights on the bottom, and a PID controller tuned for simple and smooth motion primitives. However, the approach comes at the cost of sluggish controls and often the need to re-tune controllers with configuration changes. We propose a fast (trainable in minutes) reinforcement-learning-based approach for full 6 degree of freedom (DOF) control of an AUV, enabled by a new, highly parallelized simulator for underwater vehicle dynamics. We demonstrate that the proposed simulator models approximate hydrodynamic forces with enough accuracy that a zero-shot transfer of the learned policy to a real robot produces performance comparable to a hand-tuned PID controller. Furthermore, we show that domain randomization on the simulator produces policies that are robust to small variations in the vehicle's physical parameters.
Submitted 30 September, 2024; originally announced October 2024.
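Illustrative sketch: domain randomization, as mentioned in the last sentence above, simply means re-sampling the simulator's physical parameters every episode so the learned policy cannot overfit to one configuration. The toy environment below illustrates that pattern only; the 1-D mass/drag dynamics and the parameter ranges are invented and unrelated to the authors' AUV simulator.

import random

class RandomizedThrusterEnv:
    """Toy 1-D 'vehicle' whose mass and drag are re-sampled every episode."""

    def __init__(self, mass_range=(5.0, 15.0), drag_range=(0.1, 0.8), dt=0.1):
        self.mass_range, self.drag_range, self.dt = mass_range, drag_range, dt

    def reset(self):
        # Domain randomization: new physical parameters for each episode.
        self.mass = random.uniform(*self.mass_range)
        self.drag = random.uniform(*self.drag_range)
        self.pos, self.vel = 0.0, 0.0
        return (self.pos, self.vel)

    def step(self, thrust):
        accel = (thrust - self.drag * self.vel) / self.mass
        self.vel += accel * self.dt
        self.pos += self.vel * self.dt
        reward = -abs(1.0 - self.pos)          # track a target position of 1.0
        return (self.pos, self.vel), reward

env = RandomizedThrusterEnv()
for episode in range(2):
    obs = env.reset()
    for _ in range(5):
        obs, reward = env.step(thrust=1.0)     # a trained policy would choose the thrust
    print(f"episode {episode}: mass={env.mass:.1f}, drag={env.drag:.2f}, pos={obs[0]:.2f}")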
arXiv:2409.17958 [pdf, other] cs.CL, cs.CV
The Hard Positive Truth about Vision-Language Compositionality
Authors: Amita Kamath, Cheng-Yu Hsieh, Kai-Wei Chang, Ranjay Krishna
Abstract: Several benchmarks have concluded that our best vision-language models (e.g., CLIP) are lacking in compositionality. Given an image, these benchmarks probe a model's ability to identify its associated caption amongst a set of compositional distractors. In response, a surge of recent proposals shows improvements by finetuning CLIP with distractors as hard negatives. Our investigations reveal that these improvements have, in fact, been significantly overstated, because existing benchmarks do not probe whether finetuned vision-language models remain invariant to hard positives. By curating an evaluation dataset with 112,382 hard negatives and hard positives, we uncover that including hard positives decreases CLIP's performance by 12.9%, while humans perform effortlessly at 99%. CLIP finetuned with hard negatives results in an even larger decrease, up to 38.7%. With this finding, we then produce a 1,775,259 image-text training set with both hard negative and hard positive captions. By training with both, we see improvements on existing benchmarks while simultaneously improving performance on hard positives, indicating a more robust improvement in compositionality. Our work suggests the need for future research to rigorously test and improve CLIP's understanding of semantic relationships between related "positive" concepts.
Submitted 26 September, 2024; originally announced September 2024.
Comments: ECCV 2024

arXiv:2409.14085 [pdf, other] eess.AS, cs.SD
Codec-SUPERB @ SLT 2024: A lightweight benchmark for neural audio codec models
Authors: Haibin Wu, Xuanjun Chen, Yi-Cheng Lin, Kaiwei Chang, Jiawei Du, Ke-Han Lu, Alexander H. Liu, Ho-Lam Chung, Yuan-Kuei Wu, Dongchao Yang, Songxiang Liu, Yi-Chiao Wu, Xu Tan, James Glass, Shinji Watanabe, Hung-yi Lee
Abstract: Neural audio codec models are becoming increasingly important as they serve as tokenizers for audio, enabling efficient transmission or facilitating speech language modeling. The ideal neural audio codec should maintain content, paralinguistics, speaker characteristics, and audio information even at low bitrates. Recently, numerous advanced neural codec models have been proposed. However, codec models are often tested under varying experimental conditions. As a result, we introduce the Codec-SUPERB challenge at SLT 2024, designed to facilitate fair and lightweight comparisons among existing codec models and inspire advancements in the field. This challenge brings together representative speech applications and objective metrics, and carefully selects license-free datasets, sampling them into small sets to reduce evaluation computation costs. This paper presents the challenge's rules, datasets, five participant systems, results, and findings.
Submitted 21 September, 2024; originally announced September 2024.

arXiv:2409.12953 [pdf, other] cs.CV, cs.AI
JourneyBench: A Challenging One-Stop Vision-Language Understanding Benchmark of Generated Images
Authors: Zhecan Wang, Junzhang Liu, Chia-Wei Tang, Hani Alomari, Anushka Sivakumar, Rui Sun, Wenhao Li, Md. Atabuzzaman, Hammad Ayyubi, Haoxuan You, Alvi Ishmam, Kai-Wei Chang, Shih-Fu Chang, Chris Thomas
Abstract: Existing vision-language understanding benchmarks largely consist of images of objects in their usual contexts. As a consequence, recent multimodal large language models can perform well with only a shallow visual understanding by relying on background language biases. Thus, strong performance on these benchmarks does not necessarily correlate with strong visual understanding. In this paper, we release JourneyBench, a comprehensive human-annotated benchmark of generated images designed to assess the model's fine-grained multimodal reasoning abilities across five tasks: complementary multimodal chain of thought, multi-image VQA, imaginary image captioning, VQA with hallucination triggers, and fine-grained retrieval with sample-specific distractors. Unlike existing benchmarks, JourneyBench explicitly requires fine-grained multimodal reasoning in unusual imaginary scenarios where language bias and holistic image gist are insufficient. We benchmark state-of-the-art models on JourneyBench and analyze performance along a number of fine-grained dimensions. Results across all five tasks show that JourneyBench is exceptionally challenging for even the best models, indicating that models' visual reasoning abilities are not as strong as they first appear. We discuss the implications of our findings and propose avenues for further research.
Submitted 24 September, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
arXiv:2409.10783 [pdf, other] cs.CL
Predicting Punctuation in Ancient Chinese Texts: A Multi-Layered LSTM and Attention-Based Approach
Authors: Tracy Cai, Kimmy Chang, Fahad Nabi
Abstract: It was not until the 20th century that the Chinese language began using punctuation. In fact, many ancient Chinese texts contain thousands of lines with no distinct punctuation marks or delimiters in sight. The lack of punctuation in such texts makes it difficult for humans to identify when there are pauses or breaks between particular phrases and to understand the semantic meaning of the written text (Mogahed, 2012). As a result, unless one was educated in the ancient time period, many readers of ancient Chinese would have significantly different interpretations of the texts. We propose an approach to predict the location (and type) of punctuation in ancient Chinese texts that extends the work of Oh et al. (2017) by leveraging a bidirectional multi-layered LSTM with a multi-head attention mechanism, as inspired by Luong et al.'s (2015) discussion of attention-based architectures. We find that the use of multi-layered LSTMs and multi-head attention significantly outperforms RNNs that do not incorporate such components when evaluating ancient Chinese texts.
Submitted 16 September, 2024; originally announced September 2024.
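Illustrative sketch: the architecture named in the abstract above, a bidirectional multi-layer LSTM followed by multi-head attention and a per-character classifier, can be set up in a few lines of PyTorch. The sketch below is one plausible reading of that description, not the authors' code; the vocabulary size, hidden sizes, and the four punctuation classes are made up.

import torch
import torch.nn as nn

class PunctuationTagger(nn.Module):
    def __init__(self, vocab_size=6000, emb=128, hidden=128, layers=2, heads=4, classes=4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb)
        self.lstm = nn.LSTM(emb, hidden, num_layers=layers,
                            bidirectional=True, batch_first=True)
        self.attn = nn.MultiheadAttention(embed_dim=2 * hidden, num_heads=heads,
                                          batch_first=True)
        self.out = nn.Linear(2 * hidden, classes)    # e.g. none / comma / period / question

    def forward(self, char_ids):
        h, _ = self.lstm(self.embed(char_ids))       # (B, T, 2*hidden)
        attended, _ = self.attn(h, h, h)             # multi-head self-attention over the sequence
        return self.out(attended)                    # per-character punctuation logits

model = PunctuationTagger()
dummy = torch.randint(0, 6000, (2, 40))              # batch of 2 "lines", 40 characters each
print(model(dummy).shape)                            # torch.Size([2, 40, 4])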
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07326">arXiv:2409.07326</a> <span> [<a href="https://arxiv.org/pdf/2409.07326">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ART: Artifact Removal Transformer for Reconstructing Noise-Free Multichannel Electroencephalographic Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chuang%2C+C">Chun-Hsiang Chuang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kong-Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chih-Sheng Huang</a>, <a href="/search/cs?searchtype=author&query=Bessas%2C+A">Anne-Mei Bessas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07326v1-abstract-short" style="display: inline;"> Artifact removal in electroencephalography (EEG) is a longstanding challenge that significantly impacts neuroscientific analysis and brain-computer interface (BCI) performance. Tackling this problem demands advanced algorithms, extensive noisy-clean training data, and thorough evaluation strategies. This study presents the Artifact Removal Transformer (ART), an innovative EEG denoising model emplo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07326v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07326v1-abstract-full" style="display: none;"> Artifact removal in electroencephalography (EEG) is a longstanding challenge that significantly impacts neuroscientific analysis and brain-computer interface (BCI) performance. Tackling this problem demands advanced algorithms, extensive noisy-clean training data, and thorough evaluation strategies. This study presents the Artifact Removal Transformer (ART), an innovative EEG denoising model employing transformer architecture to adeptly capture the transient millisecond-scale dynamics characteristic of EEG signals. Our approach offers a holistic, end-to-end denoising solution for diverse artifact types in multichannel EEG data. We enhanced the generation of noisy-clean EEG data pairs using an independent component analysis, thus fortifying the training scenarios critical for effective supervised learning. We performed comprehensive validations using a wide range of open datasets from various BCI applications, employing metrics like mean squared error and signal-to-noise ratio, as well as sophisticated techniques such as source localization and EEG component classification. Our evaluations confirm that ART surpasses other deep-learning-based artifact removal methods, setting a new benchmark in EEG signal processing. This advancement not only boosts the accuracy and reliability of artifact removal but also promises to catalyze further innovations in the field, facilitating the study of brain dynamics in naturalistic environments. 
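ART is described above as a transformer-based model that maps noisy multichannel EEG to clean EEG end to end, trained on noisy-clean pairs. The sketch below shows that kind of sequence-to-sequence denoiser in minimal form; the channel count, window length, and model dimensions are assumptions for illustration and do not reflect the published ART configuration.

```python
# Illustrative sketch (assumed shapes and sizes, not the published ART model):
# a transformer encoder that maps noisy multichannel EEG windows to clean ones,
# trained with mean squared error on noisy-clean pairs.
import torch
import torch.nn as nn

class EEGDenoiser(nn.Module):
    def __init__(self, n_channels=32, d_model=128, n_heads=8, n_layers=4):
        super().__init__()
        self.in_proj = nn.Linear(n_channels, d_model)   # per-time-step projection
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.out_proj = nn.Linear(d_model, n_channels)  # back to channel space

    def forward(self, x):             # x: (batch, time, channels), noisy EEG
        return self.out_proj(self.encoder(self.in_proj(x)))

model = EEGDenoiser()
noisy = torch.randn(4, 512, 32)       # e.g. 4 windows of 512 samples, 32 channels
clean = torch.randn(4, 512, 32)       # paired clean targets (e.g. from an ICA pipeline)
loss = nn.functional.mse_loss(model(noisy), clean)
loss.backward()
```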
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07326v1-abstract-full').style.display = 'none'; document.getElementById('2409.07326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03363">arXiv:2409.03363</a> <span> [<a href="https://arxiv.org/pdf/2409.03363">pdf</a>, <a href="https://arxiv.org/format/2409.03363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Con-ReCall: Detecting Pre-training Data in LLMs via Contrastive Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yujun Cai</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03363v1-abstract-short" style="display: inline;"> The training data in large language models is key to their success, but it also presents privacy and security risks, as it may contain sensitive information. Detecting pre-training data is crucial for mitigating these concerns. Existing methods typically analyze target text in isolation or solely with non-member contexts, overlooking potential insights from simultaneously considering both member a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03363v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03363v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03363v1-abstract-full" style="display: none;"> The training data in large language models is key to their success, but it also presents privacy and security risks, as it may contain sensitive information. Detecting pre-training data is crucial for mitigating these concerns. Existing methods typically analyze target text in isolation or solely with non-member contexts, overlooking potential insights from simultaneously considering both member and non-member contexts. While previous work suggested that member contexts provide little information due to the minor distributional shift they induce, our analysis reveals that these subtle shifts can be effectively leveraged when contrasted with non-member contexts. In this paper, we propose Con-ReCall, a novel approach that leverages the asymmetric distributional shifts induced by member and non-member contexts through contrastive decoding, amplifying subtle differences to enhance membership inference. 
Extensive empirical evaluations demonstrate that Con-ReCall achieves state-of-the-art performance on the WikiMIA benchmark and is robust against various text manipulation techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03363v1-abstract-full').style.display = 'none'; document.getElementById('2409.03363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01037">arXiv:2409.01037</a> <span> [<a href="https://arxiv.org/pdf/2409.01037">pdf</a>, <a href="https://arxiv.org/format/2409.01037">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> NYK-MS: A Well-annotated Multi-modal Metaphor and Sarcasm Understanding Benchmark on Cartoon-Caption Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+K">Ke Chang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junzhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunfang Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01037v1-abstract-short" style="display: inline;"> Metaphor and sarcasm are common figurative expressions in people's communication, especially on the Internet and in memes popular among teenagers. We create a new benchmark named NYK-MS (NewYorKer for Metaphor and Sarcasm), which contains 1,583 samples for metaphor understanding tasks and 1,578 samples for sarcasm understanding tasks. These tasks include whether it contains metaphor/sarcasm, which… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01037v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01037v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01037v1-abstract-full" style="display: none;"> Metaphor and sarcasm are common figurative expressions in people's communication, especially on the Internet and in memes popular among teenagers. We create a new benchmark named NYK-MS (NewYorKer for Metaphor and Sarcasm), which contains 1,583 samples for metaphor understanding tasks and 1,578 samples for sarcasm understanding tasks. These tasks include whether it contains metaphor/sarcasm, which word or object contains metaphor/sarcasm, what it satirizes, and why it contains metaphor/sarcasm; all 7 tasks are well-annotated by at least 3 annotators. We annotate the dataset over several rounds to improve consistency and quality, and use a GUI and GPT-4V to improve our efficiency. Based on the benchmark, we conduct extensive experiments. In the zero-shot experiments, we show that Large Language Models (LLMs) and Large Multi-modal Models (LMMs) cannot perform the classification tasks well, and that as model scale increases, performance on the other 5 tasks improves.
In experiments on traditional pre-trained models, we show improvements from augmentation and alignment methods, which demonstrates that our benchmark is consistent with previous datasets and requires models to understand both modalities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01037v1-abstract-full').style.display = 'none'; document.getElementById('2409.01037v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14262">arXiv:2408.14262</a> <span> [<a href="https://arxiv.org/pdf/2408.14262">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised Speech Representations Still Struggle with African American Vernacular English </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kalvin Chang</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+Y">Yi-Hui Chou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hsuan-Ming Chen</a>, <a href="/search/cs?searchtype=author&query=Holliday%2C+N">Nicole Holliday</a>, <a href="/search/cs?searchtype=author&query=Scharenborg%2C+O">Odette Scharenborg</a>, <a href="/search/cs?searchtype=author&query=Mortensen%2C+D+R">David R. Mortensen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14262v1-abstract-short" style="display: inline;"> Underperformance of ASR systems for speakers of African American Vernacular English (AAVE) and other marginalized language varieties is a well-documented phenomenon, and one that reinforces the stigmatization of these varieties. We investigate whether or not the recent wave of Self-Supervised Learning (SSL) speech models can close the gap in ASR performance between AAVE and Mainstream American Eng… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14262v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14262v1-abstract-full" style="display: none;"> Underperformance of ASR systems for speakers of African American Vernacular English (AAVE) and other marginalized language varieties is a well-documented phenomenon, and one that reinforces the stigmatization of these varieties.
We investigate whether or not the recent wave of Self-Supervised Learning (SSL) speech models can close the gap in ASR performance between AAVE and Mainstream American English (MAE). We evaluate four SSL models (wav2vec 2.0, HuBERT, WavLM, and XLS-R) on zero-shot Automatic Speech Recognition (ASR) for these two varieties and find that these models perpetuate the bias in performance against AAVE. Additionally, the models have higher word error rates on utterances with more phonological and morphosyntactic features of AAVE. Despite the success of SSL speech models in improving ASR for low resource varieties, SSL pre-training alone may not bridge the gap between AAVE and MAE. Our code is publicly available at https://github.com/cmu-llab/s3m-aave. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14262v1-abstract-full').style.display = 'none'; document.getElementById('2408.14262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13040">arXiv:2408.13040</a> <span> [<a href="https://arxiv.org/pdf/2408.13040">pdf</a>, <a href="https://arxiv.org/format/2408.13040">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2024.3436618">10.1109/TASLP.2024.3436618 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SpeechPrompt: Prompting Speech Language Models for Speech Processing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haibin Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuan-Kuei Wu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hua Shen</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+I">Iu-thing Kang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13040v1-abstract-short" style="display: inline;"> 
Prompting has become a practical method for utilizing pre-trained language models (LMs). This approach offers several advantages. It allows an LM to adapt to new tasks with minimal training and parameter updates, thus achieving efficiency in both storage and computation. Additionally, prompting modifies only the LM's inputs and harnesses the generative capabilities of language models to address va… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13040v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13040v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13040v1-abstract-full" style="display: none;"> Prompting has become a practical method for utilizing pre-trained language models (LMs). This approach offers several advantages. It allows an LM to adapt to new tasks with minimal training and parameter updates, thus achieving efficiency in both storage and computation. Additionally, prompting modifies only the LM's inputs and harnesses the generative capabilities of language models to address various downstream tasks in a unified manner. This significantly reduces the need for human labor in designing task-specific models. These advantages become even more evident as the number of tasks served by the LM scales up. Motivated by the strengths of prompting, we are the first to explore the potential of prompting speech LMs in the domain of speech processing. Recently, there has been a growing interest in converting speech into discrete units for language modeling. Our pioneer research demonstrates that these quantized speech units are highly versatile within our unified prompting framework. Not only can they serve as class labels, but they also contain rich phonetic information that can be re-synthesized back into speech signals for speech generation tasks. Specifically, we reformulate speech processing tasks into speech-to-unit generation tasks. As a result, we can seamlessly integrate tasks such as speech classification, sequence generation, and speech generation within a single, unified prompting framework. The experiment results show that the prompting method can achieve competitive performance compared to the strong fine-tuning method based on self-supervised learning models with a similar number of trainable parameters. The prompting method also shows promising results in the few-shot setting. Moreover, with the advanced speech LMs coming into the stage, the proposed prompting framework attains great potential. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13040v1-abstract-full').style.display = 'none'; document.getElementById('2408.13040v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in IEEE/ACM Transactions on Audio, Speech, and Language Processing (TASLP)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 32, pp. 
3730-3744, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05457">arXiv:2408.05457</a> <span> [<a href="https://arxiv.org/pdf/2408.05457">pdf</a>, <a href="https://arxiv.org/format/2408.05457">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Investigating Instruction Tuning Large Language Models on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kerui Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bo-Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bowen Jin</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yizhu Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+M">Ming Zhong</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kevin Chang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shou-De Lin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05457v1-abstract-short" style="display: inline;"> Inspired by the recent advancements of Large Language Models (LLMs) in NLP tasks, there's growing interest in applying LLMs to graph-related tasks. This study delves into the capabilities of instruction-following LLMs for engaging with real-world graphs, aiming to offer empirical insights into how LLMs can effectively interact with graphs and generalize across graph tasks. We begin by constructing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05457v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05457v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05457v1-abstract-full" style="display: none;"> Inspired by the recent advancements of Large Language Models (LLMs) in NLP tasks, there's growing interest in applying LLMs to graph-related tasks. This study delves into the capabilities of instruction-following LLMs for engaging with real-world graphs, aiming to offer empirical insights into how LLMs can effectively interact with graphs and generalize across graph tasks. We begin by constructing a dataset designed for instruction tuning, which comprises a diverse collection of 79 graph-related tasks from academic and e-commerce domains, featuring 44,240 training instances and 18,960 test samples. Utilizing this benchmark, our initial investigation focuses on identifying the optimal graph representation that serves as a conduit for LLMs to understand complex graph structures. Our findings indicate that JSON format for graph representation consistently outperforms natural language and code formats across various LLMs and graph types. Furthermore, we examine the key factors that influence the generalization abilities of instruction-tuned LLMs by evaluating their performance on both in-domain and out-of-domain graph tasks. 
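The abstract above reports that serializing graphs as JSON works better than natural-language or code descriptions as LLM input. Below is a minimal illustration of what such a JSON serialization and prompt might look like; the node/edge field names and the example question are assumptions, since the paper's exact schema is not shown here.

```python
# Minimal illustration of serializing a graph as JSON for an LLM prompt.
# Field names and the example task are assumptions, not the paper's exact schema.
import json

graph = {
    "nodes": [
        {"id": 0, "type": "paper", "title": "Attention Is All You Need"},
        {"id": 1, "type": "paper", "title": "BERT"},
        {"id": 2, "type": "author", "name": "A. Vaswani"},
    ],
    "edges": [
        {"source": 2, "target": 0, "relation": "writes"},
        {"source": 1, "target": 0, "relation": "cites"},
    ],
}

prompt = (
    "You are given a graph in JSON format:\n"
    + json.dumps(graph, indent=2)
    + "\n\nQuestion: Which papers cite node 0?"
)
print(prompt)
```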
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05457v1-abstract-full').style.display = 'none'; document.getElementById('2408.05457v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01046">arXiv:2408.01046</a> <span> [<a href="https://arxiv.org/pdf/2408.01046">pdf</a>, <a href="https://arxiv.org/format/2408.01046">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> QUDSELECT: Selective Decoding for Questions Under Discussion Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Suvarna%2C+A">Ashima Suvarna</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Parekh%2C+T">Tanmay Parekh</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01046v1-abstract-short" style="display: inline;"> Question Under Discussion (QUD) is a discourse framework that uses implicit questions to reveal discourse relationships between sentences. In QUD parsing, each sentence is viewed as an answer to a question triggered by an anchor sentence in prior context. The resulting QUD structure is required to conform to several theoretical criteria like answer compatibility (how well the question is answered)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01046v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01046v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01046v1-abstract-full" style="display: none;"> Question Under Discussion (QUD) is a discourse framework that uses implicit questions to reveal discourse relationships between sentences. In QUD parsing, each sentence is viewed as an answer to a question triggered by an anchor sentence in prior context. The resulting QUD structure is required to conform to several theoretical criteria like answer compatibility (how well the question is answered), making QUD parsing a challenging task. Previous works construct QUD parsers in a pipelined manner (i.e. detect the trigger sentence in context and then generate the question). However, these parsers lack a holistic view of the task and can hardly satisfy all the criteria. In this work, we introduce QUDSELECT, a joint-training framework that selectively decodes the QUD dependency structures considering the QUD criteria. Using instruction-tuning, we train models to simultaneously predict the anchor sentence and generate the associated question. 
To explicitly incorporate the criteria, we adopt a selective decoding strategy of sampling multiple QUD candidates during inference, followed by selecting the best one with criteria scorers. Our method outperforms the state-of-the-art baseline models by 9% in human evaluation and 4% in automatic evaluation, demonstrating the effectiveness of our framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01046v1-abstract-full').style.display = 'none'; document.getElementById('2408.01046v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 Pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21358">arXiv:2407.21358</a> <span> [<a href="https://arxiv.org/pdf/2407.21358">pdf</a>, <a href="https://arxiv.org/format/2407.21358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Tree-of-Traversals: A Zero-Shot Reasoning Algorithm for Augmenting Black-box Language Models with Knowledge Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Markowitz%2C+E">Elan Markowitz</a>, <a href="/search/cs?searchtype=author&query=Ramakrishna%2C+A">Anil Ramakrishna</a>, <a href="/search/cs?searchtype=author&query=Dhamala%2C+J">Jwala Dhamala</a>, <a href="/search/cs?searchtype=author&query=Mehrabi%2C+N">Ninareh Mehrabi</a>, <a href="/search/cs?searchtype=author&query=Peris%2C+C">Charith Peris</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+R">Rahul Gupta</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21358v1-abstract-short" style="display: inline;"> Knowledge graphs (KGs) complement Large Language Models (LLMs) by providing reliable, structured, domain-specific, and up-to-date external knowledge. However, KGs and LLMs are often developed separately and must be integrated after training. We introduce Tree-of-Traversals, a novel zero-shot reasoning algorithm that enables augmentation of black-box LLMs with one or more KGs. The algorithm equips… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21358v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21358v1-abstract-full" style="display: none;"> Knowledge graphs (KGs) complement Large Language Models (LLMs) by providing reliable, structured, domain-specific, and up-to-date external knowledge. However, KGs and LLMs are often developed separately and must be integrated after training. 
We introduce Tree-of-Traversals, a novel zero-shot reasoning algorithm that enables augmentation of black-box LLMs with one or more KGs. The algorithm equips a LLM with actions for interfacing a KG and enables the LLM to perform tree search over possible thoughts and actions to find high confidence reasoning paths. We evaluate on two popular benchmark datasets. Our results show that Tree-of-Traversals significantly improves performance on question answering and KG question answering tasks. Code is available at \url{https://github.com/amazon-science/tree-of-traversals} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21358v1-abstract-full').style.display = 'none'; document.getElementById('2407.21358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the ACL 2024 Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19283">arXiv:2407.19283</a> <span> [<a href="https://arxiv.org/pdf/2407.19283">pdf</a>, <a href="https://arxiv.org/format/2407.19283">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Smart Contracts, Smarter Payments: Innovating Cross Border Payments and Reporting Transactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mridul%2C+M+A">Maruf Ahmed Mridul</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kaiyang Chang</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Aparna Gupta</a>, <a href="/search/cs?searchtype=author&query=Seneviratne%2C+O">Oshani Seneviratne</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19283v1-abstract-short" style="display: inline;"> The global financial landscape is experiencing significant transformation driven by technological advancements and evolving market dynamics. Moreover, blockchain technology has become a pivotal platform with widespread applications, especially in finance. Cross-border payments have emerged as a key area of interest, with blockchain offering inherent benefits such as enhanced security, transparency… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19283v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19283v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19283v1-abstract-full" style="display: none;"> The global financial landscape is experiencing significant transformation driven by technological advancements and evolving market dynamics. Moreover, blockchain technology has become a pivotal platform with widespread applications, especially in finance. 
Cross-border payments have emerged as a key area of interest, with blockchain offering inherent benefits such as enhanced security, transparency, and efficiency compared to traditional banking systems. This paper presents a novel framework leveraging blockchain technology and smart contracts to emulate cross-border payments, ensuring interoperability and compliance with international standards such as ISO20022. Key contributions of this paper include a novel prototype framework for implementing smart contracts and web clients for streamlined transactions and a mechanism to translate ISO20022 standard messages. Our framework can provide a practical solution for secure, efficient, and transparent cross-border transactions, contributing to the ongoing evolution of global finance and the emerging landscape of decentralized finance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19283v1-abstract-full').style.display = 'none'; document.getElementById('2407.19283v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 1 figure, 1 table, CIFEr Conference 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08473">arXiv:2407.08473</a> <span> [<a href="https://arxiv.org/pdf/2407.08473">pdf</a>, <a href="https://arxiv.org/format/2407.08473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Natural language is not enough: Benchmarking multi-modal generative AI for Verilog generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kaiyan Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhirong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yunhao Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenlong Zhu</a>, <a href="/search/cs?searchtype=author&query=wang%2C+k">kun wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haobo Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cangyuan Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengdi Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shengwen Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huawei Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yinhe Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Ying Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08473v1-abstract-short" style="display: inline;"> Natural language interfaces have exhibited considerable potential in the automation of Verilog generation derived from high-level specifications through the utilization of large language models, garnering significant 
attention. Nevertheless, this paper elucidates that visual representations contribute essential contextual information critical to design intent for hardware architectures possessing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08473v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08473v1-abstract-full" style="display: none;"> Natural language interfaces have exhibited considerable potential in the automation of Verilog generation derived from high-level specifications through the utilization of large language models, garnering significant attention. Nevertheless, this paper elucidates that visual representations contribute essential contextual information critical to design intent for hardware architectures possessing spatial complexity, potentially surpassing the efficacy of natural-language-only inputs. Expanding upon this premise, our paper introduces an open-source benchmark for multi-modal generative models tailored for Verilog synthesis from visual-linguistic inputs, addressing both singular and complex modules. Additionally, we introduce an open-source visual and natural language Verilog query language framework to facilitate efficient and user-friendly multi-modal queries. To evaluate the performance of the proposed multi-modal hardware generative AI in Verilog generation tasks, we compare it with a popular method that relies solely on natural language. Our results demonstrate a significant accuracy improvement in the multi-modal generated Verilog compared to queries based solely on natural language. We hope to reveal a new approach to hardware design in the large-hardware-design-model era, thereby fostering a more diversified and productive approach to hardware design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08473v1-abstract-full').style.display = 'none'; document.getElementById('2407.08473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICCAD 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06549">arXiv:2407.06549</a> <span> [<a href="https://arxiv.org/pdf/2407.06549">pdf</a>, <a href="https://arxiv.org/format/2407.06549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AutoTask: Task Aware Multi-Faceted Single Model for Multi-Task Ads Relevance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shouchang Guo</a>, <a href="/search/cs?searchtype=author&query=Damani%2C+S">Sonam Damani</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Keng-hao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06549v1-abstract-short" style="display: inline;"> Ads relevance models are crucial in determining the relevance between user search queries and ad offers, often framed as a classification problem. The complexity of modeling increases significantly with multiple ad types and varying scenarios that exhibit both similarities and differences. In this work, we introduce a novel multi-faceted attention model that performs task aware feature combination… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06549v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06549v1-abstract-full" style="display: none;"> Ads relevance models are crucial in determining the relevance between user search queries and ad offers, often framed as a classification problem. The complexity of modeling increases significantly with multiple ad types and varying scenarios that exhibit both similarities and differences. In this work, we introduce a novel multi-faceted attention model that performs task aware feature combination and cross task interaction modeling. Our technique formulates the feature combination problem as "language" modeling with auto-regressive attentions across both feature and task dimensions. Specifically, we introduce a new dimension of task ID encoding for task representations, thereby enabling precise relevance modeling across diverse ad scenarios with substantial improvement in generality capability for unseen tasks. We demonstrate that our model not only effectively handles the increased computational and maintenance demands as scenarios proliferate, but also outperforms generalized DNN models and even task-specific models across a spectrum of ad applications using a single unified model. 
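The AutoTask abstract above frames feature combination as auto-regressive attention across feature and task dimensions, with a task-ID encoding so that one unified model serves many ad scenarios. The sketch below is a loose, simplified rendering of that idea, not the production model: the feature count, dimensions, and the choice to append the task embedding as an extra token are all assumptions.

```python
# Rough sketch (assumed dimensions, not the production model): feature fields are
# treated as a token sequence, a learned task-ID embedding is appended, and causal
# ("auto-regressive") self-attention mixes features in a task-aware way before a
# relevance head.
import torch
import torch.nn as nn

class TaskAwareRelevance(nn.Module):
    def __init__(self, n_features=16, d_model=64, n_tasks=10, n_heads=4):
        super().__init__()
        self.feature_proj = nn.Linear(1, d_model)          # scalar feature -> token
        self.task_embed = nn.Embedding(n_tasks, d_model)   # task-ID encoding
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.head = nn.Linear(d_model, 1)

    def forward(self, features, task_id):    # features: (batch, n_features)
        tokens = self.feature_proj(features.unsqueeze(-1))           # (B, F, d)
        tokens = torch.cat([tokens, self.task_embed(task_id)[:, None]], dim=1)
        seq_len = tokens.size(1)
        causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), 1)
        mixed, _ = self.attn(tokens, tokens, tokens, attn_mask=causal)
        return torch.sigmoid(self.head(mixed[:, -1]))                # relevance score

score = TaskAwareRelevance()(torch.randn(8, 16), torch.randint(0, 10, (8,)))
```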
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06549v1-abstract-full').style.display = 'none'; document.getElementById('2407.06549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02511">arXiv:2407.02511</a> <span> [<a href="https://arxiv.org/pdf/2407.02511">pdf</a>, <a href="https://arxiv.org/format/2407.02511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-A*: Large Language Model Enhanced Incremental Heuristic Search on Path Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+S">Silin Meng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng-Fu Yang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02511v1-abstract-short" style="display: inline;"> Path planning is a fundamental scientific problem in robotics and autonomous navigation, requiring the derivation of efficient routes from starting to destination points while avoiding obstacles. Traditional algorithms like A* and its variants are capable of ensuring path validity but suffer from significant computational and memory inefficiencies as the state space grows. Conversely, large langua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02511v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02511v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02511v1-abstract-full" style="display: none;"> Path planning is a fundamental scientific problem in robotics and autonomous navigation, requiring the derivation of efficient routes from starting to destination points while avoiding obstacles. Traditional algorithms like A* and its variants are capable of ensuring path validity but suffer from significant computational and memory inefficiencies as the state space grows. Conversely, large language models (LLMs) excel in broader environmental analysis through contextual understanding, providing global insights into environments. However, they fall short in detailed spatial and temporal reasoning, often leading to invalid or inefficient routes. In this work, we propose LLM-A*, a new LLM-based route planning method that synergistically combines the precise pathfinding capabilities of A* with the global reasoning capability of LLMs.
This hybrid approach aims to enhance pathfinding efficiency in terms of time and space complexity while maintaining the integrity of path validity, especially in large-scale scenarios. By integrating the strengths of both methodologies, LLM-A* addresses the computational and memory limitations of conventional algorithms without compromising on the validity required for effective pathfinding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02511v1-abstract-full').style.display = 'none'; document.getElementById('2407.02511v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to The 2024 Conference on Empirical Methods in Natural Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02235">arXiv:2407.02235</a> <span> [<a href="https://arxiv.org/pdf/2407.02235">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards a Holistic Framework for Multimodal Large Language Models in Three-dimensional Brain CT Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Cheng-Yi Li</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kao-Jung Chang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng-Fu Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hsin-Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenting Chen</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi-Ping Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu-Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shih-Pin Chen</a>, <a href="/search/cs?searchtype=author&query=Lirng%2C+J">Jiing-Feng Lirng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Chiou%2C+S">Shih-Hwa Chiou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02235v1-abstract-short" style="display: inline;"> Multi-modal large language models (MLLMs) have been given free rein to explore exciting medical applications with a primary focus on radiology report generation. Nevertheless, the preliminary success in 2D radiology captioning is incompetent to reflect the real-world diagnostic challenge in the volumetric 3D anatomy. 
To mitigate three crucial limitation aspects in the existing literature, includin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02235v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02235v1-abstract-full" style="display: none;"> Multi-modal large language models (MLLMs) have been given free rein to explore exciting medical applications with a primary focus on radiology report generation. Nevertheless, the preliminary success in 2D radiology captioning is incompetent to reflect the real-world diagnostic challenge in the volumetric 3D anatomy. To mitigate three crucial limitation aspects in the existing literature, including (1) data complexity, (2) model capacity, and (3) evaluation metric fidelity, we collected an 18,885 text-scan pairs 3D-BrainCT dataset and applied clinical visual instruction tuning (CVIT) to train BrainGPT models to generate radiology-adherent 3D brain CT reports. Statistically, our BrainGPT scored BLEU-1 = 44.35, BLEU-4 = 20.38, METEOR = 30.13, ROUGE-L = 47.6, and CIDEr-R = 211.77 during internal testing and demonstrated an accuracy of 0.91 in captioning midline shifts on the external validation CQ500 dataset. By further inspecting the captioned report, we reported that the traditional metrics appeared to measure only the surface text similarity and failed to gauge the information density of the diagnostic purpose. To close this gap, we proposed a novel Feature-Oriented Radiology Task Evaluation (FORTE) to estimate the report's clinical relevance (lesion feature and landmarks). Notably, the BrainGPT model scored an average FORTE F1-score of 0.71 (degree=0.661; landmark=0.706; feature=0.693; impression=0.779). To demonstrate that BrainGPT models possess objective readiness to generate human-like radiology reports, we conducted a Turing test that enrolled 11 physician evaluators, and around 74% of the BrainGPT-generated captions were indistinguishable from those written by humans. Our work embodies a holistic framework that showcased the first-hand experience of curating a 3D brain CT dataset, fine-tuning anatomy-sensible language models, and proposing robust radiology evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02235v1-abstract-full').style.display = 'none'; document.getElementById('2407.02235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
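As a quick sanity check on the numbers quoted in the BrainGPT abstract above, the reported average FORTE F1-score of 0.71 is consistent with the mean of the four component scores:

```python
# Arithmetic check of the average FORTE F1 quoted in the abstract:
# the mean of the four component scores should round to 0.71.
forte = {"degree": 0.661, "landmark": 0.706, "feature": 0.693, "impression": 0.779}
avg = sum(forte.values()) / len(forte)
print(round(avg, 2))  # 0.71
```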
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 figures, 5 supplementary figures, 8 supplementary tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00377">arXiv:2407.00377</a> <span> [<a href="https://arxiv.org/pdf/2407.00377">pdf</a>, <a href="https://arxiv.org/format/2407.00377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> The Factuality Tax of Diversity-Intervened Text-to-Image Generation: Benchmark and Fact-Augmented Intervention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yixin Wan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoran Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00377v2-abstract-short" style="display: inline;"> Prompt-based "diversity interventions" are commonly adopted to improve the diversity of Text-to-Image (T2I) models depicting individuals with various racial or gender traits. However, will this strategy result in nonfactual demographic distribution, especially when generating real historical figures. In this work, we propose DemOgraphic FActualIty Representation (DoFaiR), a benchmark to systematic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00377v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00377v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00377v2-abstract-full" style="display: none;"> Prompt-based "diversity interventions" are commonly adopted to improve the diversity of Text-to-Image (T2I) models depicting individuals with various racial or gender traits. However, will this strategy result in nonfactual demographic distribution, especially when generating real historical figures. In this work, we propose DemOgraphic FActualIty Representation (DoFaiR), a benchmark to systematically quantify the trade-off between using diversity interventions and preserving demographic factuality in T2I models. DoFaiR consists of 756 meticulously fact-checked test instances to reveal the factuality tax of various diversity prompts through an automated evidence-supported evaluation pipeline. Experiments on DoFaiR unveil that diversity-oriented instructions increase the number of different gender and racial groups in DALLE-3's generations at the cost of historically inaccurate demographic distributions. 
To resolve this issue, we propose Fact-Augmented Intervention (FAI), which instructs a Large Language Model (LLM) to reflect on verbalized or retrieved factual information about the historical gender and racial composition of the generation subjects, and to incorporate it into the generation context of T2I models. By orienting model generations with the reflected historical truths, FAI significantly improves demographic factuality under diversity interventions while preserving diversity.
Submitted 23 October, 2024; v1 submitted 29 June, 2024; originally announced July 2024.
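FAI, as described above, is essentially a prompt-construction step: ask an LLM for the relevant historical facts, then fold them into the T2I generation context alongside the diversity instruction. The sketch below only illustrates that flow; ask_llm and generate_image are stand-in stubs, and the prompt wording is an invented example rather than the authors' template.

```python
# Hypothetical sketch of a FAI-style fact-augmented intervention.
def ask_llm(question: str) -> str:
    """Placeholder for an LLM call that returns verbalized factual context."""
    return "The signers of the U.S. Declaration of Independence were all men."

def generate_image(prompt: str) -> None:
    """Placeholder for a diffusion / T2I backend call."""
    print(f"[T2I prompt] {prompt}")

def fact_augmented_prompt(user_prompt: str, subject: str) -> str:
    # Step 1: have the LLM reflect on the historical demographic facts.
    facts = ask_llm(
        f"What is known about the gender and racial composition of {subject}? "
        "Answer concisely and factually."
    )
    # Step 2: fold the reflected facts into the generation context so the
    # diversity instruction cannot override documented history.
    return f"{facts} Depict this faithfully. {user_prompt}"

generate_image(fact_augmented_prompt(
    "A painting of the signing of the Declaration of Independence, "
    "showing people of diverse backgrounds.",
    "the signers of the Declaration of Independence",
))
```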
arXiv:2407.00191 (https://arxiv.org/abs/2407.00191) [pdf, other]
Subjects: Computation and Language (cs.CL)
MetaKP: On-Demand Keyphrase Generation
Authors: Di Wu, Xiaoxian Shen, Kai-Wei Chang
Abstract: Traditional keyphrase prediction methods predict a single set of keyphrases per document, failing to cater to the diverse needs of users and downstream applications. To bridge the gap, we introduce on-demand keyphrase generation, a novel paradigm that requires keyphrases conforming to specific high-level goals or intents. For this task, we present MetaKP, a large-scale benchmark comprising four datasets, 7,500 documents, and 3,760 goals across news and biomedical domains with human-annotated keyphrases. Leveraging MetaKP, we design both supervised and unsupervised methods, including a multi-task fine-tuning approach and a self-consistency prompting method with large language models. The results highlight the challenges of supervised fine-tuning, whose performance is not robust to distribution shifts. By contrast, the proposed self-consistency prompting approach greatly improves the performance of large language models, enabling GPT-4o to achieve 0.548 SemF1 and surpass a fully fine-tuned BART-base model. Finally, we demonstrate the potential of our method to serve as general NLP infrastructure, exemplified by its application to epidemic event detection from social media.
Submitted 4 October, 2024; v1 submitted 28 June, 2024; originally announced July 2024.
Comments: EMNLP 2024 (Findings)
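The self-consistency prompting method can be pictured as sampling several keyphrase lists for the same document and goal, then keeping only phrases that recur across samples. The routine below sketches that aggregation; sample_keyphrases is a stub for a stochastic LLM call, and the vote threshold is an arbitrary illustrative choice, not the paper's setting.

```python
import random
from collections import Counter

random.seed(0)

def sample_keyphrases(document: str, goal: str) -> list:
    """Stub standing in for one stochastic LLM keyphrase-generation call."""
    pool = ["vaccination rates", "influenza outbreak", "hospital capacity", "school closures"]
    return random.sample(pool, k=random.randint(2, 3))

def self_consistent_keyphrases(document: str, goal: str,
                               n_samples: int = 5, min_votes: int = 3) -> list:
    """Aggregate repeated samples and keep keyphrases that recur often enough."""
    votes = Counter()
    for _ in range(n_samples):
        votes.update(set(sample_keyphrases(document, goal)))   # one vote per sample
    return [phrase for phrase, count in votes.most_common() if count >= min_votes]

print(self_consistent_keyphrases("CDC reports rising flu admissions...", "public health events"))
```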
arXiv:2406.19486 (https://arxiv.org/abs/2406.19486) [pdf, other]
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Emerging Technologies (cs.ET); Machine Learning (cs.LG); Signal Processing (eess.SP)
LoPT: Low-Rank Prompt Tuning for Parameter Efficient Language Models
Authors: Shouchang Guo, Sonam Damani, Keng-hao Chang
Abstract: In prompt tuning, a prefix or suffix text is added to the prompt, and the embeddings (soft prompts) or token indices (hard prompts) of the prefix/suffix are optimized to gain more control over language models for specific tasks. This approach eliminates the need for hand-crafted prompt engineering or explicit model fine-tuning. Prompt tuning is significantly more parameter-efficient than model fine-tuning, as it involves optimizing only partial inputs of language models to produce desired outputs. In this work, we aim to further reduce the number of trainable parameters required for a language model to perform well on specific tasks. We propose Low-rank Prompt Tuning (LoPT), a low-rank model for prompts that achieves efficient prompt optimization. The proposed method achieves results similar to full-parameter prompt tuning while reducing the number of trainable parameters by a factor of 5. It also provides promising results compared to state-of-the-art methods that require 10 to 20 times more parameters.
Submitted 27 June, 2024; originally announced June 2024.
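One natural reading of the low-rank prompt idea is to parameterize the soft prompt as a rank-r product instead of a full prompt-length by hidden-size matrix. The PyTorch sketch below shows that parameterization; the dimensions, rank, and initialization are illustrative assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class LowRankSoftPrompt(nn.Module):
    """Soft prompt stored as a rank-r product instead of a full (prompt_len x hidden_dim) matrix."""
    def __init__(self, prompt_len: int = 20, hidden_dim: int = 768, rank: int = 4):
        super().__init__()
        self.u = nn.Parameter(torch.randn(prompt_len, rank) * 0.02)
        self.v = nn.Parameter(torch.randn(rank, hidden_dim) * 0.02)

    def forward(self, input_embeds: torch.Tensor) -> torch.Tensor:
        # Reconstruct the prompt embeddings and prepend them to the (frozen) input embeddings.
        prompt = self.u @ self.v
        prompt = prompt.unsqueeze(0).expand(input_embeds.size(0), -1, -1)
        return torch.cat([prompt, input_embeds], dim=1)

soft_prompt = LowRankSoftPrompt()
batch = torch.randn(2, 16, 768)        # stand-in for token embeddings from a frozen LM
print(soft_prompt(batch).shape)        # torch.Size([2, 36, 768])
print(sum(p.numel() for p in soft_prompt.parameters()), "vs", 20 * 768)
```

With these toy dimensions the factorization stores 3,152 trainable prompt parameters instead of 15,360, roughly the 5x reduction the abstract reports.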
arXiv:2406.15178 (https://arxiv.org/abs/2406.15178) [pdf, other]
Subjects: Computation and Language (cs.CL)
Hybrid Alignment Training for Large Language Models
Authors: Chenglong Wang, Hang Zhou, Kaiyan Chang, Bei Li, Yongyu Mu, Tong Xiao, Tongran Liu, Jingbo Zhu
Abstract: Alignment training is crucial for enabling large language models (LLMs) to cater to human intentions and preferences. It is typically performed in two stages with different objectives: instruction-following alignment and human-preference alignment. However, aligning LLMs with these objectives in sequence suffers from an inherent problem: the objectives may conflict, and the LLMs cannot be guaranteed to align well with both the instructions and human preferences simultaneously. In response, we propose a Hybrid Alignment Training (Hbat) approach based on alternating alignment and modified elastic weight consolidation methods. The basic idea is to alternate between the different objectives during alignment training, so that better collaboration can be achieved between the two alignment tasks. We experiment with Hbat on summarization and dialogue tasks. Experimental results show that the proposed Hbat can significantly outperform all baselines. Notably, Hbat yields consistent performance gains over the traditional two-stage alignment training when using both proximal policy optimization and direct preference optimization.
Submitted 21 June, 2024; originally announced June 2024.
Comments: accepted by ACL (Findings) 2024
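The alternating-objective idea with an elastic weight consolidation (EWC) penalty can be illustrated with a toy loop that switches between two alignment losses phase by phase while a quadratic penalty keeps parameters near the previous phase's anchor. The stand-in losses, the all-ones Fisher estimate, and the tiny linear model below are illustrative assumptions, not the paper's training recipe.

```python
import torch

# Stand-ins for the two alignment objectives; the real losses (e.g., an SFT
# cross-entropy and a PPO/DPO preference loss) are not modeled here.
def instruction_loss(model, batch):
    return model(batch).pow(2).mean()

def preference_loss(model, batch):
    return (model(batch) - 1.0).pow(2).mean()

def ewc_penalty(model, anchor, fisher, lam=1.0):
    """Quadratic penalty keeping parameters close to the previous phase's anchor."""
    return lam * sum((f * (p - a).pow(2)).sum()
                     for p, a, f in zip(model.parameters(), anchor, fisher))

model = torch.nn.Linear(8, 1)
fisher = [torch.ones_like(p) for p in model.parameters()]   # placeholder Fisher estimate
opt = torch.optim.SGD(model.parameters(), lr=1e-2)

# Alternate between the two objectives instead of running them as two sequential stages.
for loss_fn in [instruction_loss, preference_loss] * 3:
    anchor = [p.detach().clone() for p in model.parameters()]
    for _ in range(10):
        loss = loss_fn(model, torch.randn(4, 8)) + ewc_penalty(model, anchor, fisher)
        opt.zero_grad()
        loss.backward()
        opt.step()
print("final loss:", float(loss))
```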
arXiv:2406.14137 (https://arxiv.org/abs/2406.14137) [pdf, other]
Subjects: Computation and Language (cs.CL)
MACAROON: Training Vision-Language Models To Be Your Engaged Partners
Authors: Shujin Wu, Yi R. Fung, Sha Li, Yixin Wan, Kai-Wei Chang, Heng Ji
Abstract: Large vision-language models (LVLMs), while proficient in following instructions and responding to diverse questions, invariably generate detailed responses even when questions are ambiguous or unanswerable, leading to hallucinations and bias issues. Thus, it is essential for LVLMs to proactively engage with humans to ask for clarifications or additional information that enables better responses. In this study, we aim to shift LVLMs from passive answer providers to proactive engaged partners. We begin by establishing a three-tiered hierarchy for questions of an invalid, ambiguous, or personalizable nature to measure the proactive engagement capabilities of LVLMs. Utilizing this hierarchy, we create PIE (ProactIve Engagement Evaluation) through GPT-4o and human annotators, consisting of 853 questions across six distinct, fine-grained question types that are verified by human annotators and accompanied by well-defined metrics. Our evaluations on PIE indicate poor performance of existing LVLMs, with the best-performing open-weights model achieving an Aggregate Align Rate (AAR) of only 0.28. In response, we introduce MACAROON (self-iMaginAtion for ContrAstive pReference OptimizatiON), which instructs LVLMs to autonomously generate contrastive response pairs for unlabeled questions given the task description and human-crafted criteria. The self-imagined data is then formatted for conditional reinforcement learning. Experimental results show that MACAROON effectively improves LVLMs' capability to be proactively engaged (0.84 AAR) while maintaining comparable performance on general tasks.
Submitted 17 October, 2024; v1 submitted 20 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code will be made public at https://github.com/ShujinWu-0814/MACAROON</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13692">arXiv:2406.13692</a> <span> [<a href="https://arxiv.org/pdf/2406.13692">pdf</a>, <a href="https://arxiv.org/format/2406.13692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Synchronous Faithfulness Monitoring for Trustworthy Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jia-Chen Gu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+F">Fan Yin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13692v2-abstract-short" style="display: inline;"> Retrieval-augmented language models (RALMs) have shown strong performance and wide applicability in knowledge-intensive tasks. However, there are significant trustworthiness concerns as RALMs are prone to generating unfaithful outputs, including baseless information or contradictions with the retrieved context. This paper proposes SynCheck, a lightweight monitor that leverages fine-grained decodin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13692v2-abstract-full').style.display = 'inline'; document.getElementById('2406.13692v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13692v2-abstract-full" style="display: none;"> Retrieval-augmented language models (RALMs) have shown strong performance and wide applicability in knowledge-intensive tasks. However, there are significant trustworthiness concerns as RALMs are prone to generating unfaithful outputs, including baseless information or contradictions with the retrieved context. This paper proposes SynCheck, a lightweight monitor that leverages fine-grained decoding dynamics including sequence likelihood, uncertainty quantification, context influence, and semantic alignment to synchronously detect unfaithful sentences. By integrating efficiently measurable and complementary signals, SynCheck enables accurate and immediate feedback and intervention, achieving 0.85 AUROC in detecting faithfulness errors across six long-form retrieval-augmented generation tasks, improving prior best method by 4%. Leveraging SynCheck, we further introduce FOD, a faithfulness-oriented decoding algorithm guided by beam search for long-form retrieval-augmented generation. Empirical results demonstrate that FOD outperforms traditional strategies such as abstention, reranking, or contrastive decoding significantly in terms of faithfulness, achieving over 10% improvement across six datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13692v2-abstract-full').style.display = 'none'; document.getElementById('2406.13692v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13444">arXiv:2406.13444</a> <span> [<a href="https://arxiv.org/pdf/2406.13444">pdf</a>, <a href="https://arxiv.org/format/2406.13444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VDebugger: Harnessing Execution Feedback for Debugging Visual Programs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xueqing Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Songyan Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Te-Lin Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+P">Pan Lu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13444v3-abstract-short" style="display: inline;"> Visual programs are executable code generated by large language models to address visual reasoning problems. They decompose complex questions into multiple reasoning steps and invoke specialized models for each step to solve the problems. However, these programs are prone to logic errors, with our preliminary evaluation showing that 58% of the total errors are caused by program logic errors. Debug… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13444v3-abstract-full').style.display = 'inline'; document.getElementById('2406.13444v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13444v3-abstract-full" style="display: none;"> Visual programs are executable code generated by large language models to address visual reasoning problems. They decompose complex questions into multiple reasoning steps and invoke specialized models for each step to solve the problems. However, these programs are prone to logic errors, with our preliminary evaluation showing that 58% of the total errors are caused by program logic errors. Debugging complex visual programs remains a major bottleneck for visual reasoning. 
To address this, we introduce VDebugger, a novel critic-refiner framework trained to localize and debug visual programs by tracking execution step by step. VDebugger identifies and corrects program errors by leveraging detailed execution feedback, improving interpretability and accuracy. The training data is generated through an automated pipeline that injects errors into correct visual programs using a novel mask-best decoding technique. Evaluations on six datasets demonstrate VDebugger's effectiveness, showing performance improvements of up to 3.2% in downstream task accuracy. Further studies show VDebugger's ability to generalize to unseen tasks, bringing a notable improvement of 2.3% on the unseen COVR task. Code, data, and models are made publicly available at https://github.com/shirley-wu/vdebugger/
Submitted 4 October, 2024; v1 submitted 19 June, 2024; originally announced June 2024.
Comments: EMNLP 2024 Findings
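The critic-refiner loop driven by execution feedback can be pictured as: run the candidate program, capture the failure trace, let a critic localize the fault, and let a refiner rewrite the program. In the sketch below the critic and refiner are trivial stubs and the failing "visual program" is a one-line toy, so it only illustrates the control flow, not the trained models.

```python
import traceback

def critic(program: str, feedback: str) -> str:
    """Stub for a critic model that localizes the faulty step from execution feedback."""
    return "Suspect line: the call that raised the exception."

def refiner(program: str, critique: str) -> str:
    """Stub for a refiner model that rewrites the program given the critique."""
    return program.replace("1 / 0", "1 / max(denominator, 1)")

def debug_program(program: str, max_rounds: int = 3) -> str:
    env = {"denominator": 0}
    for _ in range(max_rounds):
        try:
            exec(program, env)                    # run the candidate program
            return program                        # executed cleanly: keep it
        except Exception:
            feedback = traceback.format_exc()     # detailed execution feedback
            program = refiner(program, critic(program, feedback))
    return program

print(debug_program("answer = 1 / 0"))
```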
arXiv:2406.12725 (https://arxiv.org/abs/2406.12725) [pdf]
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Can Large Language Models Code Like a Linguist?: A Case Study in Low Resource Sound Law Induction
Authors: Atharva Naik, Kexun Zhang, Nathaniel Robinson, Aravind Mysore, Clayton Marr, Hong Sng Rebecca Byrnes, Anna Cai, Kalvin Chang, David Mortensen
Abstract: Historical linguists have long written a kind of incompletely formalized "program", consisting of a series of ordered string rewrite functions (called sound laws), that converts reconstructed words in an ancestor language into words in one of its attested descendants. They do this by observing pairs of words in the reconstructed language (protoforms) and the descendant language (reflexes) and constructing a program that transforms protoforms into reflexes. However, writing these programs is error-prone and time-consuming. Prior work has successfully scaffolded this process computationally, but fewer researchers have tackled Sound Law Induction (SLI), which we approach in this paper by casting it as Programming by Examples. We propose a language-agnostic solution that uses the programming ability of Large Language Models (LLMs) to generate Python sound law programs from sound change examples. We evaluate the effectiveness of our approach for various LLMs, propose effective methods to generate additional language-agnostic synthetic data to fine-tune LLMs for SLI, and compare our method with existing automated SLI methods, showing that while LLMs lag behind them, they can complement some of their weaknesses.
Submitted 18 June, 2024; originally announced June 2024.
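Casting sound law induction as Programming by Examples means a proposed Python rewrite program is judged by whether it maps each protoform to its attested reflex. The snippet below sketches that checking step; the rewrite rules and word pairs are invented for illustration and are not drawn from the paper's data.

```python
# Hypothetical harness for checking a candidate sound-law program against
# protoform/reflex example pairs; rules and words below are invented.
def candidate_sound_law(protoform: str) -> str:
    """A candidate 'program': an ordered sequence of string rewrites."""
    rules = [("p", "b"), ("ti", "chi")]        # ordered rewrite rules
    word = protoform
    for old, new in rules:
        word = word.replace(old, new)
    return word

examples = [("pati", "bachi"), ("tipa", "chiba")]

def coverage(sound_law, pairs) -> float:
    """Fraction of reflexes the candidate program reproduces exactly."""
    return sum(sound_law(proto) == reflex for proto, reflex in pairs) / len(pairs)

print(f"coverage: {coverage(candidate_sound_law, examples):.2f}")
```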
arXiv:2406.10746 (https://arxiv.org/abs/2406.10746) [pdf, other]
Subjects: Computation and Language (cs.CL); Information Retrieval (cs.IR)
SparseCL: Sparse Contrastive Learning for Contradiction Retrieval
Authors: Haike Xu, Zongyu Lin, Yizhou Sun, Kai-Wei Chang, Piotr Indyk
Abstract: Contradiction retrieval refers to identifying and extracting documents that explicitly disagree with or refute the content of a query, which is important to many downstream applications like fact checking and data cleaning. For retrieving documents that contradict a query from large corpora, existing methods such as similarity search and cross-encoder models exhibit significant limitations. The former struggles to capture the essence of contradiction because of its inherent bias toward similarity, while the latter suffers from computational inefficiency, especially when the corpora are large. To address these challenges, we introduce SparseCL, a novel approach that leverages specially trained sentence embeddings designed to preserve subtle, contradictory nuances between sentences. Our method uses a combined metric of cosine similarity and a sparsity function to efficiently identify and retrieve documents that contradict a given query. This approach dramatically enhances the speed of contradiction detection by reducing the need for exhaustive document comparisons to simple vector calculations. We validate our model using the Arguana dataset, a benchmark specifically geared towards contradiction retrieval, as well as synthetic contradictions generated from the MSMARCO and HotpotQA datasets using GPT-4. Our experiments demonstrate the efficacy of our approach not only in contradiction retrieval, with more than 30% accuracy improvements on MSMARCO and HotpotQA across different model architectures, but also in applications such as cleaning corrupted corpora to restore high-quality QA retrieval. This paper outlines a promising direction for improving the accuracy and efficiency of contradiction retrieval in large-scale text corpora.
Submitted 15 June, 2024; originally announced June 2024.
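The abstract above describes scoring candidates with a combination of cosine similarity and a sparsity function over embeddings. The snippet below sketches one such score using the Hoyer sparsity of the embedding difference; the choice of Hoyer, the weighting, and the random vectors are assumptions made here for illustration and not necessarily the paper's exact formulation.

```python
import numpy as np

def hoyer_sparsity(x: np.ndarray) -> float:
    """Hoyer sparsity in [0, 1]; higher means the vector's mass sits in few dimensions."""
    n = x.size
    l1, l2 = np.abs(x).sum(), np.linalg.norm(x) + 1e-12
    return (np.sqrt(n) - l1 / l2) / (np.sqrt(n) - 1)

def contradiction_score(q: np.ndarray, d: np.ndarray, alpha: float = 1.0) -> float:
    """Cosine similarity plus a sparsity term on the embedding difference."""
    cos = q @ d / (np.linalg.norm(q) * np.linalg.norm(d) + 1e-12)
    return cos + alpha * hoyer_sparsity(q - d)

rng = np.random.default_rng(0)
query, doc = rng.normal(size=64), rng.normal(size=64)
print(round(contradiction_score(query, doc), 3))
```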
arXiv:2406.09411 (https://arxiv.org/abs/2406.09411) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
MuirBench: A Comprehensive Benchmark for Robust Multi-image Understanding
Authors: Fei Wang, Xingyu Fu, James Y. Huang, Zekun Li, Qin Liu, Xiaogeng Liu, Mingyu Derek Ma, Nan Xu, Wenxuan Zhou, Kai Zhang, Tianyi Lorena Yan, Wenjie Jacky Mo, Hsiang-Hui Liu, Pan Lu, Chunyuan Li, Chaowei Xiao, Kai-Wei Chang, Dan Roth, Sheng Zhang, Hoifung Poon, Muhao Chen
Abstract: We introduce MuirBench, a comprehensive benchmark that focuses on the robust multi-image understanding capabilities of multimodal LLMs. MuirBench consists of 12 diverse multi-image tasks (e.g., scene understanding, ordering) that involve 10 categories of multi-image relations (e.g., multiview, temporal relations). Comprising 11,264 images and 2,600 multiple-choice questions, MuirBench is created in a pairwise manner, where each standard instance is paired with an unanswerable variant that has minimal semantic differences, to enable reliable assessment. Evaluating 20 recent multimodal LLMs, our results reveal that even the best-performing models like GPT-4o and Gemini Pro find MuirBench challenging, achieving 68.0% and 49.3% accuracy, respectively. Open-source multimodal LLMs trained on single images can hardly generalize to multi-image questions, hovering below 33.3% accuracy. These results highlight the importance of MuirBench in encouraging the community to develop multimodal LLMs that can look beyond a single image, and suggest potential pathways for future improvements.
Submitted 1 July, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: typos corrected, references added, Project Page: https://muirbench.github.io/
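The pairwise construction, in which each answerable instance is shadowed by an unanswerable variant, suggests scoring models per pair as well as per question. The toy scorer below illustrates that idea; the data layout and the joint pair-accuracy aggregation are assumptions for illustration, not MuirBench's official protocol.

```python
# Each dict holds (prediction, gold) answer letters for a standard question
# and its unanswerable variant; all values here are toy examples.
pairs = [
    {"standard": ("B", "B"), "unanswerable": ("E", "E")},
    {"standard": ("C", "A"), "unanswerable": ("E", "E")},
]

def question_accuracy(pairs):
    answers = [qa for p in pairs for qa in (p["standard"], p["unanswerable"])]
    return sum(pred == gold for pred, gold in answers) / len(answers)

def pair_accuracy(pairs):
    """Credit a pair only when both of its members are answered correctly."""
    return sum(all(pred == gold for pred, gold in p.values()) for p in pairs) / len(pairs)

print(f"per-question accuracy: {question_accuracy(pairs):.2f}")
print(f"pair accuracy:         {pair_accuracy(pairs):.2f}")
```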
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09411v2-abstract-full').style.display = 'none'; document.getElementById('2406.09411v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">typos corrected, references added, Project Page: https://muirbench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05755">arXiv:2406.05755</a> <span> [<a href="https://arxiv.org/pdf/2406.05755">pdf</a>, <a href="https://arxiv.org/format/2406.05755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A DeNoising FPN With Transformer R-CNN for Tiny Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hou-I Liu</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+Y">Yu-Wen Tseng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pin-Jyun Wang</a>, <a href="/search/cs?searchtype=author&query=Shuai%2C+H">Hong-Han Shuai</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wen-Huang Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05755v3-abstract-short" style="display: inline;"> Despite notable advancements in the field of computer vision, the precise detection of tiny objects continues to pose a significant challenge, largely owing to the minuscule pixel representation allocated to these objects in imagery data. This challenge resonates profoundly in the domain of geoscience and remote sensing, where high-fidelity detection of tiny objects can facilitate a myriad of appl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05755v3-abstract-full').style.display = 'inline'; document.getElementById('2406.05755v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05755v3-abstract-full" style="display: none;"> Despite notable advancements in the field of computer vision, the precise detection of tiny objects continues to pose a significant challenge, largely owing to the minuscule pixel representation allocated to these objects in imagery data. This challenge resonates profoundly in the domain of geoscience and remote sensing, where high-fidelity detection of tiny objects can facilitate a myriad of applications ranging from urban planning to environmental monitoring. In this paper, we propose a new framework, namely, DeNoising FPN with Trans R-CNN (DNTR), to improve the performance of tiny object detection. 
DNTR consists of an easy plug-in design, DeNoising FPN (DN-FPN), and an effective Transformer-based detector, Trans R-CNN. Specifically, feature fusion in the feature pyramid network is important for detecting multiscale objects. However, noisy features may be produced during the fusion process, since there is no regularization between the features of different scales. Therefore, we introduce a DN-FPN module that uses contrastive learning to suppress noise in each level's features along the top-down path of the FPN. Second, based on the two-stage framework, we replace the obsolete R-CNN detector with a novel Trans R-CNN detector to focus on the representation of tiny objects with self-attention. Experimental results demonstrate that DNTR outperforms the baselines by at least 17.4% in terms of APvt on the AI-TOD dataset and by 9.6% in terms of AP on the VisDrone dataset. Our code will be available at https://github.com/hoiliu-0801/DNTR.
Submitted 15 June, 2024; v1 submitted 9 June, 2024; originally announced June 2024.
Comments: The article is accepted by IEEE Transactions on Geoscience and Remote Sensing. Our code will be available at https://github.com/hoiliu-0801/DNTR
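The DN-FPN idea of regularizing feature fusion by contrasting features across pyramid levels can be sketched as an InfoNCE-style loss in which features at the same spatial location of two levels are positives and all other locations are negatives. This is a generic sketch of that kind of regularizer; the exact positive and negative construction, projection heads, and temperature in the paper may differ.

```python
import torch
import torch.nn.functional as F

def fpn_contrastive_loss(top_down: torch.Tensor, lateral: torch.Tensor, tau: float = 0.1):
    """InfoNCE-style loss between per-location features of two pyramid levels.

    Positives are features at the same spatial location; negatives are all other
    locations in the batch. A generic sketch, not the exact DN-FPN formulation.
    """
    b, c, h, w = top_down.shape
    a = F.normalize(top_down.permute(0, 2, 3, 1).reshape(-1, c), dim=-1)   # (b*h*w, c)
    p = F.normalize(lateral.permute(0, 2, 3, 1).reshape(-1, c), dim=-1)
    logits = a @ p.t() / tau                      # similarity of every location pair
    targets = torch.arange(a.size(0))             # matching locations are positives
    return F.cross_entropy(logits, targets)

top_down = torch.randn(2, 256, 8, 8, requires_grad=True)
lateral = torch.randn(2, 256, 8, 8)
loss = fpn_contrastive_loss(top_down, lateral)
loss.backward()
print(float(loss))
```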
arXiv:2406.05003 (https://arxiv.org/abs/2406.05003) [pdf, other]
Subjects: Robotics (cs.RO); Human-Computer Interaction (cs.HC)
Designs for Enabling Collaboration in Human-Machine Teaming via Interactive and Explainable Systems
Authors: Rohan Paleja, Michael Munje, Kimberlee Chang, Reed Jensen, Matthew Gombolay
Abstract: Collaborative robots and machine learning-based virtual agents are increasingly entering the human workspace with the aim of increasing productivity and enhancing safety. Despite this, we show in a ubiquitous experimental domain, Overcooked-AI, that state-of-the-art techniques for human-machine teaming (HMT), which rely on imitation or reinforcement learning, are brittle and result in a machine agent that attempts to decouple the machine's and human's actions and act independently rather than synergistically. To remedy this deficiency, we develop HMT approaches that enable iterative, mixed-initiative team development, allowing end-users to interactively reprogram interpretable AI teammates. Our 50-subject study provides several findings that we summarize into guidelines. While all approaches underperform a simple collaborative heuristic (a critical, negative result for learning-based methods), we find that white-box approaches supported by interactive modification can lead to significant team development, outperforming white-box approaches alone, and that black-box approaches are easier to train and result in better HMT performance, highlighting a tradeoff between explainability and interactivity on the one hand and ease of training on the other. Together, these findings suggest three important future research directions: 1) improving the ability to generate collaborative agents with white-box models, 2) better learning methods to facilitate collaboration rather than individualized coordination, and 3) mixed-initiative interfaces that enable users, who may vary in ability, to improve collaboration.
Submitted 16 November, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>