Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 157 results for author: <span class="mathjax">Chung, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Chung%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chung, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chung%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chung, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Chung%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Chung%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chung%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chung%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chung%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15602">arXiv:2502.15602</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15602">pdf</a>, <a href="https://arxiv.org/format/2502.15602">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> KAD: No More FAD! An Effective and Efficient Evaluation Metric for Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Yoonjin Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Eu%2C+P">Pilsun Eu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Junwon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+K">Keunwoo Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Nam%2C+J">Juhan Nam</a>, <a href="/search/cs?searchtype=author&amp;query=Chon%2C+B+S">Ben Sangbae Chon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15602v1-abstract-short" style="display: inline;"> Although being widely adopted for evaluating generated audio signals, the Fr茅chet Audio Distance (FAD) suffers from significant limitations, including reliance on Gaussian assumptions, sensitivity to sample size, and high computational complexity. 
As an alternative, we introduce the Kernel Audio Distance (KAD), a novel, distribution-free, unbiased, and computationally efficient metric based on Max&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15602v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15602v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15602v1-abstract-full" style="display: none;"> Although being widely adopted for evaluating generated audio signals, the Fr茅chet Audio Distance (FAD) suffers from significant limitations, including reliance on Gaussian assumptions, sensitivity to sample size, and high computational complexity. As an alternative, we introduce the Kernel Audio Distance (KAD), a novel, distribution-free, unbiased, and computationally efficient metric based on Maximum Mean Discrepancy (MMD). Through analysis and empirical validation, we demonstrate KAD&#39;s advantages: (1) faster convergence with smaller sample sizes, enabling reliable evaluation with limited data; (2) lower computational cost, with scalable GPU acceleration; and (3) stronger alignment with human perceptual judgments. By leveraging advanced embeddings and characteristic kernels, KAD captures nuanced differences between real and generated audio. Open-sourced in the kadtk toolkit, KAD provides an efficient, reliable, and perceptually aligned benchmark for evaluating generative audio models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15602v1-abstract-full').style.display = 'none'; document.getElementById('2502.15602v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15419">arXiv:2502.15419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15419">pdf</a>, <a href="https://arxiv.org/format/2502.15419">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Beyond Translation: LLM-Based Data Generation for Multilingual Fact-Checking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Yi-Ling Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Cobo%2C+A">Aurora Cobo</a>, <a href="/search/cs?searchtype=author&amp;query=Serna%2C+P">Pablo Serna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15419v1-abstract-short" style="display: inline;"> Robust automatic fact-checking systems have the potential to combat online misinformation at scale. However, most existing research primarily focuses on English. 
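The KAD metric above is built on Maximum Mean Discrepancy between embeddings of real and generated audio. As a rough sketch of that idea only (not the kadtk implementation; the RBF kernel, the bandwidth, and the random stand-in embedding matrices are assumptions for illustration), an unbiased MMD^2 estimate can be computed like this:

```python
# Sketch of a kernel-based audio distance in the spirit of KAD: an unbiased
# MMD^2 estimate between embeddings of reference and generated audio.
# Kernel choice, bandwidth, and the random embeddings are illustrative only.
import numpy as np

def rbf_kernel(a: np.ndarray, b: np.ndarray, bandwidth: float) -> np.ndarray:
    # Pairwise squared distances between rows of a and rows of b.
    sq = np.sum(a**2, 1)[:, None] + np.sum(b**2, 1)[None, :] - 2.0 * a @ b.T
    return np.exp(-sq / (2.0 * bandwidth**2))

def mmd2_unbiased(x: np.ndarray, y: np.ndarray, bandwidth: float = 10.0) -> float:
    # x: (n, d) embeddings of real audio; y: (m, d) embeddings of generated audio.
    n, m = len(x), len(y)
    kxx = rbf_kernel(x, x, bandwidth)
    kyy = rbf_kernel(y, y, bandwidth)
    kxy = rbf_kernel(x, y, bandwidth)
    # Unbiased estimator: drop the diagonal self-similarity terms.
    term_x = (kxx.sum() - np.trace(kxx)) / (n * (n - 1))
    term_y = (kyy.sum() - np.trace(kyy)) / (m * (m - 1))
    return float(term_x + term_y - 2.0 * kxy.mean())

# Random stand-ins for embedding matrices, purely for illustration.
rng = np.random.default_rng(0)
real_emb = rng.normal(size=(256, 128))
gen_emb = rng.normal(loc=0.1, size=(256, 128))
print(mmd2_unbiased(real_emb, gen_emb))
```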
arXiv:2502.15419 [pdf, other]  cs.CL cs.AI cs.CY
Beyond Translation: LLM-Based Data Generation for Multilingual Fact-Checking
Authors: Yi-Ling Chung, Aurora Cobo, Pablo Serna
Abstract: Robust automatic fact-checking systems have the potential to combat online misinformation at scale. However, most existing research primarily focuses on English. In this paper, we introduce MultiSynFact, the first large-scale multilingual fact-checking dataset containing 2.2M claim-source pairs designed to support Spanish, German, English, and other low-resource languages. Our dataset generation pipeline leverages Large Language Models (LLMs), integrating external knowledge from Wikipedia and incorporating rigorous claim validation steps to ensure data quality. We evaluate the effectiveness of MultiSynFact across multiple models and experimental settings. Additionally, we open-source a user-friendly framework to facilitate further research in multilingual fact-checking and dataset generation.
Submitted 21 February, 2025; originally announced February 2025.
Comments: 15 pages, 1 figure, 18 tables

arXiv:2502.11478 [pdf, other]  cs.SD cs.LG eess.AS
TAPS: Throat and Acoustic Paired Speech Dataset for Deep Learning-Based Speech Enhancement
Authors: Yunsik Kim, Yonghun Song, Yoonyoung Chung
Abstract: In high-noise environments such as factories, subways, and busy streets, capturing clear speech is challenging due to background noise. Throat microphones provide a solution with their noise-suppressing properties, reducing the noise while recording speech. However, a significant limitation remains: high-frequency information is attenuated as sound waves pass through skin and tissue, reducing speech clarity. Recent deep learning approaches have shown promise in enhancing throat microphone recordings, but further progress is constrained by the absence of standardized dataset. We introduce a throat and acoustic paired speech dataset (TAPS), a collection of paired utterances recorded from 60 native Korean speakers using throat and acoustic microphones. To demonstrate the TAPS's utility, we tested three baseline deep learning models and identified the mapping-based approach as superior in improving speech quality and restoring content. Additionally, we propose an optimal method to mitigate the signal mismatch between throat and acoustic microphones, ensuring model performance. These results highlight the potential of TAPS to serve as a standardized dataset and advance research in throat microphone-based speech enhancement.
Submitted 17 February, 2025; originally announced February 2025.

arXiv:2502.04599 [pdf, other]  cs.HC
Fuzzy Linkography: Automatic Graphical Summarization of Creative Activity Traces
Authors: Amy Smith, Barrett R. Anderson, Jasmine Tan Otto, Isaac Karth, Yuqian Sun, John Joon Young Chung, Melissa Roemmele, Max Kreminski
Abstract: Linkography -- the analysis of links between the design moves that make up an episode of creative ideation or design -- can be used for both visual and quantitative assessment of creative activity traces. Traditional linkography, however, is time-consuming, requiring a human coder to manually annotate both the design moves within an episode and the connections between them. As a result, linkography has not yet been much applied at scale. To address this limitation, we introduce fuzzy linkography: a means of automatically constructing a linkograph from a sequence of recorded design moves via a "fuzzy" computational model of semantic similarity, enabling wider deployment and new applications of linkographic techniques. We apply fuzzy linkography to three markedly different kinds of creative activity traces (text-to-image prompting journeys, LLM-supported ideation sessions, and researcher publication histories) and discuss our findings, as well as strengths, limitations, and potential future applications of our approach.
Submitted 6 February, 2025; originally announced February 2025.
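To make the "fuzzy computational model of semantic similarity" above concrete, here is a minimal sketch that scores every earlier/later pair of design moves with a continuous similarity value. TF-IDF cosine similarity and the example moves are stand-ins chosen for illustration; the paper's actual embedding model is not specified in the abstract.

```python
# Minimal fuzzy-linkograph sketch: instead of a human coder marking binary
# links between design moves, score each earlier/later pair with a continuous
# semantic-similarity value. TF-IDF is a stand-in for a real embedding model,
# and the design moves below are invented examples.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

moves = [
    "sketch a lamp shaped like a jellyfish",
    "make the jellyfish tentacles glow from within",
    "switch to a ceiling-mounted pendant version",
    "add a dimmer controlled by touching a tentacle",
]

vectors = TfidfVectorizer().fit_transform(moves)
links = cosine_similarity(vectors)  # fuzzy link strength in [0, 1] for each pair

# A linkograph only links an earlier move to a later one, so read the upper triangle.
for i in range(len(moves)):
    for j in range(i + 1, len(moves)):
        print(f"move {i} -> move {j}: strength {links[i, j]:.2f}")
```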
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13284">arXiv:2501.13284</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13284">pdf</a>, <a href="https://arxiv.org/format/2501.13284">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3713435">10.1145/3706598.3713435 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Toyteller: AI-powered Visual Storytelling Through Toy-Playing with Character Symbols </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+J+J+Y">John Joon Young Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Roemmele%2C+M">Melissa Roemmele</a>, <a href="/search/cs?searchtype=author&amp;query=Kreminski%2C+M">Max Kreminski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13284v1-abstract-short" style="display: inline;"> We introduce Toyteller, an AI-powered storytelling system where users generate a mix of story text and visuals by directly manipulating character symbols like they are toy-playing. Anthropomorphized symbol motions can convey rich and nuanced social interactions; Toyteller leverages these motions (1) to let users steer story text generation and (2) as a visual output format that accompanies story t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13284v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13284v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13284v1-abstract-full" style="display: none;"> We introduce Toyteller, an AI-powered storytelling system where users generate a mix of story text and visuals by directly manipulating character symbols like they are toy-playing. Anthropomorphized symbol motions can convey rich and nuanced social interactions; Toyteller leverages these motions (1) to let users steer story text generation and (2) as a visual output format that accompanies story text. We enabled motion-steered text generation and text-steered motion generation by mapping motions and text onto a shared semantic space so that large language models and motion generation models can use it as a translational layer. Technical evaluations showed that Toyteller outperforms a competitive baseline, GPT-4o. Our user study identified that toy-playing helps express intentions difficult to verbalize. However, only motions could not express all user intentions, suggesting combining it with other modalities like language. We discuss the design space of toy-playing interactions and implications for technical HCI research on human-AI interaction. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13284v1-abstract-full').style.display = 'none'; document.getElementById('2501.13284v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CHI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12372">arXiv:2501.12372</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12372">pdf</a>, <a href="https://arxiv.org/format/2501.12372">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Is Long Context All You Need? Leveraging LLM&#39;s Extended Context for NL2SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Yeounoh Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Kakkar%2C+G+T">Gaurav T. Kakkar</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+Y">Yu Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Milne%2C+B">Brenton Milne</a>, <a href="/search/cs?searchtype=author&amp;query=Ozcan%2C+F">Fatma Ozcan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12372v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated impressive capabilities across a range of natural language processing tasks. In particular, improvements in reasoning abilities and the expansion of context windows have opened new avenues for leveraging these powerful models. NL2SQL is challenging in that the natural language question is inherently ambiguous, while the SQL generation requires a preci&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12372v3-abstract-full').style.display = 'inline'; document.getElementById('2501.12372v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12372v3-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated impressive capabilities across a range of natural language processing tasks. In particular, improvements in reasoning abilities and the expansion of context windows have opened new avenues for leveraging these powerful models. NL2SQL is challenging in that the natural language question is inherently ambiguous, while the SQL generation requires a precise understanding of complex data schema and semantics. One approach to this semantic ambiguous problem is to provide more and sufficient contextual information. In this work, we explore the performance and the latency trade-offs of the extended context window (a.k.a., long context) offered by Google&#39;s state-of-the-art LLM (\textit{gemini-1.5-pro}). 
We study the impact of various contextual information, including column example values, question and SQL query pairs, user-provided hints, SQL documentation, and schema. To the best of our knowledge, this is the first work to study how the extended context window and extra contextual information can help NL2SQL generation with respect to both accuracy and latency cost. We show that long context LLMs are robust and do not get lost in the extended contextual information. Additionally, our long-context NL2SQL pipeline based on Google&#39;s \textit{gemini-pro-1.5} achieve strong performances on various benchmark datasets without finetuning and expensive self-consistency based techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12372v3-abstract-full').style.display = 'none'; document.getElementById('2501.12372v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09099">arXiv:2501.09099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09099">pdf</a>, <a href="https://arxiv.org/format/2501.09099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Drama Llama: An LLM-Powered Storylets Framework for Authorable Responsiveness in Interactive Narrative </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuqian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P+J">Phoebe J. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+J+J+Y">John Joon Young Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Roemmele%2C+M">Melissa Roemmele</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+T">Taewook Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kreminski%2C+M">Max Kreminski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09099v1-abstract-short" style="display: inline;"> In this paper, we present Drama Llama, an LLM-powered storylets framework that supports the authoring of responsive, open-ended interactive stories. DL combines the structural benefits of storylet-based systems with the generative capabilities of large language models, enabling authors to create responsive interactive narratives while maintaining narrative control. 
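The NL2SQL entry above enumerates the kinds of contextual information that can be packed into a long-context prompt. The sketch below simply assembles such a prompt from hypothetical pieces; the schema, example values, hints, and few-shot pairs are invented, and the paper's actual prompt format is not given in the abstract.

```python
# Hypothetical assembly of a long-context NL2SQL prompt from the context
# types the abstract lists: schema, column example values, question/SQL
# pairs, and user hints. All names and strings below are invented.
schema = "CREATE TABLE orders (id INT, customer TEXT, total REAL, placed_at DATE);"
column_examples = {"customer": ["Acme Corp", "Globex"], "placed_at": ["2024-01-05"]}
few_shot_pairs = [
    ("How many orders were placed in 2024?",
     "SELECT COUNT(*) FROM orders WHERE strftime('%Y', placed_at) = '2024';"),
]
hints = ["Totals are stored in USD."]
question = "What is the average order total per customer?"

sections = [
    "-- Schema --\n" + schema,
    "-- Example values --\n" + "\n".join(f"{c}: {v}" for c, v in column_examples.items()),
    "-- Examples --\n" + "\n".join(f"Q: {q}\nSQL: {s}" for q, s in few_shot_pairs),
    "-- Hints --\n" + "\n".join(hints),
    "-- Question --\n" + question,
]
prompt = "\n\n".join(sections)
print(prompt)  # this string would be sent to the long-context model
```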
arXiv:2501.09099 [pdf, other]  cs.HC
Drama Llama: An LLM-Powered Storylets Framework for Authorable Responsiveness in Interactive Narrative
Authors: Yuqian Sun, Phoebe J. Wang, John Joon Young Chung, Melissa Roemmele, Taewook Kim, Max Kreminski
Abstract: In this paper, we present Drama Llama, an LLM-powered storylets framework that supports the authoring of responsive, open-ended interactive stories. DL combines the structural benefits of storylet-based systems with the generative capabilities of large language models, enabling authors to create responsive interactive narratives while maintaining narrative control. Rather than crafting complex logical preconditions in a general-purpose or domain-specific programming language, authors define triggers in natural language that fire at appropriate moments in the story. Through a preliminary authoring study with six content authors, we present initial evidence that DL can generate coherent and meaningful narratives with believable character interactions. This work suggests directions for hybrid approaches that enhance authorial control while supporting emergent narrative generation through LLMs.
Submitted 15 January, 2025; originally announced January 2025.
Comments: 10 pages, 5 photos

arXiv:2501.06488 [pdf, other]  cs.CV cs.AI cs.HC cs.MM eess.IV
NVS-SQA: Exploring Self-Supervised Quality Representation Learning for Neurally Synthesized Scenes without References
Authors: Qiang Qu, Yiran Shen, Xiaoming Chen, Yuk Ying Chung, Weidong Cai, Tongliang Liu
Abstract: Neural View Synthesis (NVS), such as NeRF and 3D Gaussian Splatting, effectively creates photorealistic scenes from sparse viewpoints, typically evaluated by quality assessment methods like PSNR, SSIM, and LPIPS. However, these full-reference methods, which compare synthesized views to reference views, may not fully capture the perceptual quality of neurally synthesized scenes (NSS), particularly due to the limited availability of dense reference views. Furthermore, the challenges in acquiring human perceptual labels hinder the creation of extensive labeled datasets, risking model overfitting and reduced generalizability. To address these issues, we propose NVS-SQA, a NSS quality assessment method to learn no-reference quality representations through self-supervision without reliance on human labels. Traditional self-supervised learning predominantly relies on the "same instance, similar representation" assumption and extensive datasets. However, given that these conditions do not apply in NSS quality assessment, we employ heuristic cues and quality scores as learning objectives, along with a specialized contrastive pair preparation process to improve the effectiveness and efficiency of learning. The results show that NVS-SQA outperforms 17 no-reference methods by a large margin (i.e., on average 109.5% in SRCC, 98.6% in PLCC, and 91.5% in KRCC over the second best) and even exceeds 16 full-reference methods across all evaluation metrics (i.e., 22.9% in SRCC, 19.1% in PLCC, and 18.6% in KRCC over the second best).
Submitted 11 January, 2025; originally announced January 2025.
arXiv:2412.08029 [pdf, other]  cs.CV cs.AI cs.HC cs.MM eess.IV  doi:10.1109/TVCG.2024.3372037
NeRF-NQA: No-Reference Quality Assessment for Scenes Generated by NeRF and Neural View Synthesis Methods
Authors: Qiang Qu, Hanxue Liang, Xiaoming Chen, Yuk Ying Chung, Yiran Shen
Abstract: Neural View Synthesis (NVS) has demonstrated efficacy in generating high-fidelity dense viewpoint videos using a image set with sparse views. However, existing quality assessment methods like PSNR, SSIM, and LPIPS are not tailored for the scenes with dense viewpoints synthesized by NVS and NeRF variants, thus, they often fall short in capturing the perceptual quality, including spatial and angular aspects of NVS-synthesized scenes. Furthermore, the lack of dense ground truth views makes the full reference quality assessment on NVS-synthesized scenes challenging. For instance, datasets such as LLFF provide only sparse images, insufficient for complete full-reference assessments. To address the issues above, we propose NeRF-NQA, the first no-reference quality assessment method for densely-observed scenes synthesized from the NVS and NeRF variants. NeRF-NQA employs a joint quality assessment strategy, integrating both viewwise and pointwise approaches, to evaluate the quality of NVS-generated scenes. The viewwise approach assesses the spatial quality of each individual synthesized view and the overall inter-views consistency, while the pointwise approach focuses on the angular qualities of scene surface points and their compound inter-point quality. Extensive evaluations are conducted to compare NeRF-NQA with 23 mainstream visual quality assessment methods (from fields of image, video, and light-field assessment). The results demonstrate NeRF-NQA outperforms the existing assessment methods significantly and it shows substantial superiority on assessing NVS-synthesized scenes without references. An implementation of this paper are available at https://github.com/VincentQQu/NeRF-NQA.
Submitted 10 December, 2024; originally announced December 2024.
Journal ref: IEEE Transactions on Visualization and Computer Graphics, vol. 30, no. 5, pp. 2129-2139, May 2024

arXiv:2412.07080 [pdf, other]  cs.CV cs.AI cs.MM  doi:10.1109/TIP.2024.3497795
EvRepSL: Event-Stream Representation via Self-Supervised Learning for Event-Based Vision
Authors: Qiang Qu, Xiaoming Chen, Yuk Ying Chung, Yiran Shen
Abstract: Event-stream representation is the first step for many computer vision tasks using event cameras. It converts the asynchronous event-streams into a formatted structure so that conventional machine learning models can be applied easily. However, most of the state-of-the-art event-stream representations are manually designed and the quality of these representations cannot be guaranteed due to the noisy nature of event-streams. In this paper, we introduce a data-driven approach aiming at enhancing the quality of event-stream representations. Our approach commences with the introduction of a new event-stream representation based on spatial-temporal statistics, denoted as EvRep. Subsequently, we theoretically derive the intrinsic relationship between asynchronous event-streams and synchronous video frames. Building upon this theoretical relationship, we train a representation generator, RepGen, in a self-supervised learning manner accepting EvRep as input. Finally, the event-streams are converted to high-quality representations, termed as EvRepSL, by going through the learned RepGen (without the need of fine-tuning or retraining). Our methodology is rigorously validated through extensive evaluations on a variety of mainstream event-based classification and optical flow datasets (captured with various types of event cameras). The experimental results highlight not only our approach's superior performance over existing event-stream representations but also its versatility, being agnostic to different event cameras and tasks.
Submitted 9 December, 2024; originally announced December 2024.
Comments: Published on IEEE Transactions on Image Processing
Journal ref: IEEE Transactions on Image Processing, vol. 33, pp. 6579-6591, 2024
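For context on the EvRepSL entry, the sketch below shows one of the simplest hand-designed event-stream representations the abstract alludes to: a per-pixel, per-polarity event-count image. It is a generic baseline for illustration, not the EvRep or EvRepSL representation itself.

```python
# Generic event-count representation: accumulate events (x, y, polarity) into
# a 2 x H x W grid so a conventional CNN can consume the stream. This is a
# common hand-designed baseline, not the EvRep/EvRepSL representation.
import numpy as np

def event_count_image(events: np.ndarray, height: int, width: int) -> np.ndarray:
    # events: (N, 4) array of [timestamp, x, y, polarity] with polarity in {0, 1}.
    rep = np.zeros((2, height, width), dtype=np.float32)
    for _, x, y, p in events:
        rep[int(p), int(y), int(x)] += 1.0
    return rep

# Tiny synthetic stream for illustration.
events = np.array([[0.001, 3, 2, 1], [0.002, 3, 2, 0], [0.005, 7, 5, 1]])
print(event_count_image(events, height=8, width=8).sum(axis=(1, 2)))  # counts per polarity
```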
Thus, we propose PriorDiffusion, using a pre-trained text-to-image diffusion model that takes both image and text description that aligned with the scene to infer affine-invariant depth through a denoising process. We also show that language priors can guide the model&#39;s attention to specific regions and help it perceive the 3D scene in alignment with user intent. Simultaneously, it acts as a constraint to accelerate the convergence of the diffusion trajectory, since learning 3D properties from a condensed, low-dimensional language feature is more efficient compared with learning from a redundant, high-dimensional image feature. By training on HyperSim and Virtual KITTI, we achieve state-of-the-art zero-shot performance and a faster convergence speed, compared with other diffusion-based depth estimators, across NYUv2, KITTI, ETH3D, and ScanNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16750v1-abstract-full').style.display = 'none'; document.getElementById('2411.16750v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12440">arXiv:2411.12440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12440">pdf</a>, <a href="https://arxiv.org/format/2411.12440">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear Kernels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haodong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Runnan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Q">Qiang Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhaoqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y+Y">Yuk Ying Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12440v3-abstract-short" style="display: inline;"> Recent advancements in 3D Gaussian Splatting (3DGS) have substantially improved novel view synthesis, enabling high-quality reconstruction and real-time rendering. However, blurring artifacts, such as floating primitives and over-reconstruction, remain challenging. 
Current methods address these issues by refining scene structure, enhancing geometric representations, addressing blur in training ima&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12440v3-abstract-full').style.display = 'inline'; document.getElementById('2411.12440v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12440v3-abstract-full" style="display: none;"> Recent advancements in 3D Gaussian Splatting (3DGS) have substantially improved novel view synthesis, enabling high-quality reconstruction and real-time rendering. However, blurring artifacts, such as floating primitives and over-reconstruction, remain challenging. Current methods address these issues by refining scene structure, enhancing geometric representations, addressing blur in training images, improving rendering consistency, and optimizing density control, yet the role of kernel design remains underexplored. We identify the soft boundaries of Gaussian ellipsoids as one of the causes of these artifacts, limiting detail capture in high-frequency regions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which replaces Gaussian kernels with linear kernels to achieve sharper and more precise results, particularly in high-frequency regions. Through evaluations on three datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along with a 30% FPS improvement over baseline 3DGS. The implementation will be made publicly available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12440v3-abstract-full').style.display = 'none'; document.getElementById('2411.12440v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
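<p>The kernel swap described in the 3DLS abstract above comes down to replacing a soft Gaussian falloff, which never quite reaches zero, with a compactly supported linear (tent) falloff that has a hard boundary. The toy 1-D comparison below only illustrates that difference; the function names and parameters are our own and are not the paper's implementation.</p>
<pre><code class="language-python">import numpy as np

def gaussian_kernel(d, sigma=1.0):
    # Smooth falloff with soft, effectively infinite support: never exactly zero.
    return np.exp(-0.5 * (d / sigma) ** 2)

def linear_kernel(d, radius=2.0):
    # Tent-shaped falloff with a hard boundary at `radius`.
    return np.maximum(0.0, 1.0 - np.abs(d) / radius)

d = np.linspace(-4.0, 4.0, 9)
print(np.round(gaussian_kernel(d), 3))   # small but nonzero far from the center
print(np.round(linear_kernel(d), 3))     # exactly zero outside the support
</code></pre>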
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09072">arXiv:2411.09072</a> [<a href="https://arxiv.org/pdf/2411.09072">pdf</a>, <a href="https://arxiv.org/format/2411.09072">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.LG</span></div>
  <p class="title is-5 mathjax">Continuous GNN-based Anomaly Detection on Edge using Efficient Adaptive Knowledge Graph Learning</p>
  <p class="authors">Authors: Sanggeon Yun, Ryozo Masukawa, William Youngwoo Chung, Minhyoung Na, Nathaniel Bastian, Mohsen Imani</p>
  <p class="abstract mathjax">Abstract: The increasing demand for robust security solutions across various industries has made Video Anomaly Detection (VAD) a critical task in applications such as intelligent surveillance, evidence investigation, and violence detection. Traditional approaches to VAD often rely on finetuning large pre-trained models, which can be computationally expensive and impractical for real-time or resource-constrained environments. To address this, MissionGNN introduced a more efficient method by training a graph neural network (GNN) using a fixed knowledge graph (KG) derived from large language models (LLMs) like GPT-4. While this approach demonstrated significant efficiency in computational power and memory, it faces limitations in dynamic environments where frequent updates to the KG are necessary due to evolving behavior trends and shifting data patterns. These updates typically require cloud-based computation, posing challenges for edge computing applications. In this paper, we propose a novel framework that facilitates continuous KG adaptation directly on edge devices, overcoming the limitations of cloud dependency. Our method dynamically modifies the KG through a three-phase process: pruning, alternating, and creating nodes, enabling real-time adaptation to changing data trends. This continuous learning approach enhances the robustness of anomaly detection models, making them more suitable for deployment in dynamic and resource-constrained environments.</p>
  <p class="is-size-7">Submitted 13 January, 2025; v1 submitted 13 November, 2024; originally announced November 2024.</p>
  <p class="comments is-size-7">Comments: Accepted to DATE 2025</p>
</li>
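<p>As a rough illustration of the three-phase knowledge-graph update named in the abstract above (pruning, alternating, and creating nodes), here is a minimal dictionary-based sketch. The graph representation, usage counters, and thresholds are our own assumptions for illustration, not the authors' algorithm.</p>
<pre><code class="language-python"># Toy sketch of a prune / alternate / create update loop over a tiny concept graph.
def adapt_kg(graph, usage, new_concepts, stale_threshold=1):
    # graph: node -> set of neighbours; usage: node -> recent hit count (assumed heuristic)
    # Phase 1 (prune): drop nodes that rarely matched recent data.
    for node in [n for n in graph if usage.get(n, 0) < stale_threshold]:
        graph.pop(node)
        for nbrs in graph.values():
            nbrs.discard(node)
    # Phase 2 (alternate): rewire each surviving node toward currently active nodes.
    active = {n for n, c in usage.items() if n in graph and c >= stale_threshold}
    for node in graph:
        graph[node] = {n for n in graph[node] if n in graph} | (active - {node})
    # Phase 3 (create): add nodes for concepts observed in new data.
    for concept in new_concepts:
        graph.setdefault(concept, set()).update(active)
    return graph

kg = {"person": {"bag"}, "bag": {"person"}, "umbrella": set()}
usage = {"person": 5, "bag": 3, "umbrella": 0}
print(adapt_kg(kg, usage, new_concepts=["scooter"]))
</code></pre>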
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24177">arXiv:2410.24177</a> [<a href="https://arxiv.org/pdf/2410.24177">pdf</a>, <a href="https://arxiv.org/format/2410.24177">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">eess.AS</span> <span class="tag is-small">cs.CL</span> <span class="tag is-small">cs.LG</span> <span class="tag is-small">cs.SD</span></div>
  <p class="title is-5 mathjax">DC-Spin: A Speaker-invariant Speech Tokenizer for Spoken Language Models</p>
  <p class="authors">Authors: Heng-Jui Chang, Hongyu Gong, Changhan Wang, James Glass, Yu-An Chung</p>
  <p class="abstract mathjax">Abstract: Spoken language models (SLMs) have gained increasing attention with advancements in text-based, decoder-only language models. SLMs process text and speech, enabling simultaneous speech understanding and generation. This paper presents Double-Codebook Speaker-invariant Clustering (DC-Spin), which aims to improve speech tokenization by bridging audio signals and SLM tokens. DC-Spin extracts speaker-invariant tokens rich in phonetic information and resilient to input variations, enhancing zero-shot SLM tasks and speech resynthesis. We propose a chunk-wise approach to enable streamable DC-Spin without retraining and degradation. Comparisons of tokenization methods (self-supervised and neural audio codecs), model scalability, and downstream task proxies show that tokens easily modeled by an n-gram LM or aligned with phonemes offer strong performance, providing insights for designing speech tokenizers for SLMs.</p>
  <p class="is-size-7">Submitted 31 October, 2024; originally announced October 2024.</p>
  <p class="comments is-size-7">Comments: Preprint</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02074">arXiv:2410.02074</a> [<a href="https://arxiv.org/pdf/2410.02074">pdf</a>, <a href="https://arxiv.org/format/2410.02074">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.IR</span> <span class="tag is-small">cs.LG</span></div>
  <p class="title is-5 mathjax">Price-guided user attention in large-scale E-commerce group recommendation</p>
  <p class="authors">Authors: Yang Shi, Young-joo Chung</p>
  <p class="abstract mathjax">Abstract: Existing group recommender systems utilize attention mechanisms to identify critical users who influence group decisions the most. We analyzed user attention scores from a widely-used group recommendation model on a real-world E-commerce dataset and found that item price and user interaction history significantly influence the selection of critical users. When item prices are low, users with extensive interaction histories are more influential in group decision-making. Conversely, their influence diminishes with higher item prices. Based on these observations, we propose a novel group recommendation approach that incorporates item price as a guiding factor for user aggregation. Our model employs an adaptive sigmoid function to adjust output logits based on item prices, enhancing the accuracy of user aggregation. Our model can be plugged into any attention-based group recommender system if the price information is available. We evaluate our model's performance on a public benchmark and a real-world dataset. We compare it with other state-of-the-art group recommendation methods. Our results demonstrate that our price-guided user attention approach outperforms the state-of-the-art methods in terms of hit ratio and mean square error.</p>
  <p class="is-size-7">Submitted 2 October, 2024; originally announced October 2024.</p>
</li>
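<p>The price-guided aggregation in the abstract above hinges on an adaptive sigmoid that rescales attention logits according to item price. The snippet below is a minimal sketch of that idea under our own assumptions about how the gate is parameterized (the log-price scaling and the alpha/beta parameters are ours); it is not the authors' model.</p>
<pre><code class="language-python">import numpy as np

def price_scaled_attention(logits, item_price, alpha=1.0, beta=0.0):
    # Scale each member's attention logit by a price-dependent sigmoid gate,
    # then renormalise with a softmax. alpha/beta control how quickly the gate
    # saturates with price; both are illustrative parameters.
    gate = 1.0 / (1.0 + np.exp(-(alpha * np.log1p(item_price) + beta)))
    scaled = logits * gate
    weights = np.exp(scaled - scaled.max())
    return weights / weights.sum()

logits = np.array([2.0, 0.5, -1.0])                      # per-member attention logits
print(price_scaled_attention(logits, item_price=3.0))    # low price: flatter weights
print(price_scaled_attention(logits, item_price=300.0))  # high price: sharper weights
</code></pre>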
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01943">arXiv:2410.01943</a> [<a href="https://arxiv.org/pdf/2410.01943">pdf</a>, <a href="https://arxiv.org/format/2410.01943">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.LG</span> <span class="tag is-small">cs.AI</span> <span class="tag is-small">cs.CL</span> <span class="tag is-small">cs.DB</span></div>
  <p class="title is-5 mathjax">CHASE-SQL: Multi-Path Reasoning and Preference Optimized Candidate Selection in Text-to-SQL</p>
  <p class="authors">Authors: Mohammadreza Pourreza, Hailong Li, Ruoxi Sun, Yeounoh Chung, Shayan Talaei, Gaurav Tarlok Kakkar, Yu Gan, Amin Saberi, Fatma Ozcan, Sercan O. Arik</p>
  <p class="abstract mathjax">Abstract: In tackling the challenges of large language model (LLM) performance for Text-to-SQL tasks, we introduce CHASE-SQL, a new framework that employs innovative strategies, using test-time compute in multi-agent modeling to improve candidate generation and selection. CHASE-SQL leverages LLMs' intrinsic knowledge to generate diverse and high-quality SQL candidates using different LLM generators with: (1) a divide-and-conquer method that decomposes complex queries into manageable sub-queries in a single LLM call; (2) chain-of-thought reasoning based on query execution plans, reflecting the steps a database engine takes during execution; and (3) a unique instance-aware synthetic example generation technique, which offers specific few-shot demonstrations tailored to test questions. To identify the best candidate, a selection agent is employed to rank the candidates through pairwise comparisons with a fine-tuned binary-candidates selection LLM. This selection approach has been demonstrated to be more robust than the alternatives. The proposed generators-selector framework not only enhances the quality and diversity of SQL queries but also outperforms previous methods. Overall, our proposed CHASE-SQL achieves state-of-the-art execution accuracy of 73.0% and 73.01% on the test set and development set of the notable BIRD Text-to-SQL dataset benchmark, rendering CHASE-SQL the top submission of the leaderboard (at the time of paper submission).</p>
  <p class="is-size-7">Submitted 2 October, 2024; originally announced October 2024.</p>
</li>
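<p>CHASE-SQL's selection step ranks candidate queries through pairwise comparisons with a binary selection model. The sketch below shows a generic round-robin version of that idea with a stand-in judge function; in the paper the judge is a fine-tuned LLM, which is not reproduced here.</p>
<pre><code class="language-python">from itertools import combinations

def select_best(candidates, prefer):
    # Rank candidates by round-robin pairwise comparisons. `prefer(a, b)` stands
    # in for the binary selection model and returns the preferred candidate.
    wins = {c: 0 for c in candidates}
    for a, b in combinations(candidates, 2):
        wins[prefer(a, b)] += 1
    return max(candidates, key=lambda c: wins[c])

candidates = [
    "SELECT name FROM users WHERE age > 30",
    "SELECT u.name FROM users u WHERE u.age > 30",
    "SELECT * FROM users",
]
# Stand-in judge: prefer the longer (more specific) query; a real judge would be a model.
print(select_best(candidates, prefer=lambda a, b: a if len(a) > len(b) else b))
</code></pre>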
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00207">arXiv:2410.00207</a> [<a href="https://arxiv.org/pdf/2410.00207">pdf</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.CL</span></div>
  <p class="title is-5 mathjax">Evaluating the performance of state-of-the-art ESG domain-specific pre-trained large language models in text classification against existing models and traditional machine learning techniques</p>
  <p class="authors">Authors: Tin Yuet Chung, Majid Latifi</p>
  <p class="abstract mathjax">Abstract: This research investigates the classification of Environmental, Social, and Governance (ESG) information within textual disclosures. The aim is to develop and evaluate binary classification models capable of accurately identifying and categorizing E, S and G-related content respectively. The motivation for this research stems from the growing importance of ESG considerations in investment decisions and corporate accountability. Accurate and efficient classification of ESG information is crucial for stakeholders to understand the impact of companies on sustainability and to make informed decisions. The research uses a quantitative approach involving data collection, data preprocessing, and the development of ESG-focused Large Language Models (LLMs) and traditional machine learning (Support Vector Machines, XGBoost) classifiers. Performance evaluation guides iterative refinement until satisfactory metrics are achieved. The research compares traditional machine learning techniques (Support Vector Machines, XGBoost), a state-of-the-art language model (FinBERT-ESG), and fine-tuned LLMs like Llama 2, employing standard Natural Language Processing performance metrics such as accuracy, precision, recall, and F1-score. A novel fine-tuning method, Qlora, is applied to LLMs, resulting in significant performance improvements across all ESG domains. The research also develops domain-specific fine-tuned models, such as EnvLlama 2-Qlora, SocLlama 2-Qlora, and GovLlama 2-Qlora, which demonstrate impressive results in ESG text classification.</p>
  <p class="is-size-7">Submitted 30 September, 2024; originally announced October 2024.</p>
  <p class="comments is-size-7">Comments: 56 pages, 9 figures</p>
</li>
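<p>One of the traditional machine-learning baselines compared in the study above is a Support Vector Machine text classifier. A minimal scikit-learn sketch of such a binary classifier for an Environmental label follows; the example sentences and labels are invented placeholders, not data from the paper.</p>
<pre><code class="language-python"># Minimal sketch of an SVM text-classification baseline for one binary ESG label.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

texts = [
    "The company reduced carbon emissions by 20 percent this year.",
    "Quarterly revenue grew due to strong product demand.",
    "New recycling programme cut landfill waste across all plants.",
    "The board approved a routine share buyback programme.",
]
is_environmental = [1, 0, 1, 0]  # placeholder labels

clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), LinearSVC())
clf.fit(texts, is_environmental)
print(clf.predict(["The firm invested in renewable energy for its factories."]))
</code></pre>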
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15087">arXiv:2409.15087</a> [<a href="https://arxiv.org/pdf/2409.15087">pdf</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">eess.IV</span> <span class="tag is-small">cs.CV</span> <span class="tag is-small">cs.LG</span></div>
  <p class="title is-5 mathjax">Towards Accountable AI-Assisted Eye Disease Diagnosis: Workflow Design, External Validation, and Continual Learning</p>
  <p class="authors">Authors: Qingyu Chen, Tiarnan D L Keenan, Elvira Agron, Alexis Allot, Emily Guan, Bryant Duong, Amr Elsawy, Benjamin Hou, Cancan Xue, Sanjeeb Bhandari, Geoffrey Broadhead, Chantal Cousineau-Krieger, Ellen Davis, William G Gensheimer, David Grasic, Seema Gupta, Luis Haddock, Eleni Konstantinou, Tania Lamba, Michele Maiberger, Dimosthenis Mantopoulos, Mitul C Mehta, Ayman G Nahri, Mutaz AL-Nawaflh, Arnold Oshinsky, et al. (13 additional authors not shown)</p>
  <p class="abstract mathjax">Abstract: Timely disease diagnosis is challenging due to increasing disease burdens and limited clinician availability. AI shows promise in diagnosis accuracy but faces real-world application issues due to insufficient validation in clinical workflows and diverse populations. This study addresses gaps in medical AI downstream accountability through a case study on age-related macular degeneration (AMD) diagnosis and severity classification. We designed and implemented an AI-assisted diagnostic workflow for AMD, comparing diagnostic performance with and without AI assistance among 24 clinicians from 12 institutions with real patient data sampled from the Age-Related Eye Disease Study (AREDS). Additionally, we demonstrated continual enhancement of an existing AI model by incorporating approximately 40,000 additional medical images (named AREDS2 dataset). The improved model was then systematically evaluated using both AREDS and AREDS2 test sets, as well as an external test set from Singapore. AI assistance markedly enhanced diagnostic accuracy and classification for 23 out of 24 clinicians, with the average F1-score increasing by 20% from 37.71 (Manual) to 45.52 (Manual + AI) (P-value &lt; 0.0001), achieving an improvement of over 50% in some cases. In terms of efficiency, AI assistance reduced diagnostic times for 17 out of the 19 clinicians tracked, with time savings of up to 40%. Furthermore, a model equipped with continual learning showed robust performance across three independent datasets, recording a 29% increase in accuracy, and elevating the F1-score from 42 to 54 in the Singapore population.</p>
  <p class="is-size-7">Submitted 23 September, 2024; originally announced September 2024.</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00879">arXiv:2409.00879</a> [<a href="https://arxiv.org/pdf/2409.00879">pdf</a>, <a href="https://arxiv.org/format/2409.00879">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.LG</span> <span class="tag is-small">cs.AI</span></div>
  <p class="title is-5 mathjax">Beyond Parameter Count: Implicit Bias in Soft Mixture of Experts</p>
  <p class="authors">Authors: Youngseog Chung, Dhruv Malik, Jeff Schneider, Yuanzhi Li, Aarti Singh</p>
  <p class="abstract mathjax">Abstract: The traditional viewpoint on Sparse Mixture of Experts (MoE) models is that instead of training a single large expert, which is computationally expensive, we can train many small experts. The hope is that if the total parameter count of the small experts equals that of the singular large expert, then we retain the representation power of the large expert while gaining computational tractability and promoting expert specialization. The recently introduced Soft MoE replaces the Sparse MoE's discrete routing mechanism with a differentiable gating function that smoothly mixes tokens. While this smooth gating function successfully mitigates the various training instabilities associated with Sparse MoE, it is unclear whether it induces implicit biases that affect Soft MoE's representation power or potential for expert specialization. We prove that Soft MoE with a single arbitrarily powerful expert cannot represent simple convex functions. This justifies that Soft MoE's success cannot be explained by the traditional viewpoint of many small experts collectively mimicking the representation power of a single large expert, and that multiple experts are actually necessary to achieve good representation power (even for a fixed total parameter count). Continuing along this line of investigation, we introduce a notion of expert specialization for Soft MoE, and while varying the number of experts yet fixing the total parameter count, we consider the following (computationally intractable) task. Given any input, how can we discover the expert subset that is specialized to predict this input's label? We empirically show that when there are many small experts, the architecture is implicitly biased in a fashion that allows us to efficiently approximate the specialized expert subset. Our method can be easily implemented to potentially reduce computation during inference.</p>
  <p class="is-size-7">Submitted 1 September, 2024; originally announced September 2024.</p>
  <p class="comments is-size-7">Comments: 21 pages, 5 figures, 13 tables</p>
</li>
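<p>For readers unfamiliar with Soft MoE, the "differentiable gating function that smoothly mixes tokens" mentioned in the abstract above can be sketched as follows: tokens are softly dispatched into slots, each slot is processed by an expert, and the expert outputs are softly combined back into per-token outputs. The shapes and the one-slot-per-expert simplification below are our assumptions, not the paper's exact setup.</p>
<pre><code class="language-python">import numpy as np

def soft_moe_layer(tokens, phi, experts):
    # tokens: (n, d); phi: (d, s) learnable slot parameters; experts: one callable per slot.
    logits = tokens @ phi                                                  # (n, s) token-slot affinities
    dispatch = np.exp(logits) / np.exp(logits).sum(axis=0, keepdims=True)  # softmax over tokens
    combine = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax over slots
    slots = dispatch.T @ tokens                                            # (s, d) soft mixes of tokens
    slot_out = np.stack([f(slots[i]) for i, f in enumerate(experts)])      # one expert per slot
    return combine @ slot_out                                              # (n, d) mixed back to tokens

rng = np.random.default_rng(0)
tokens = rng.normal(size=(4, 8))
phi = rng.normal(size=(8, 2))
experts = [lambda x: np.tanh(x), lambda x: x * 0.5]                        # toy "experts"
print(soft_moe_layer(tokens, phi, experts).shape)                          # (4, 8)
</code></pre>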
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06731">arXiv:2408.06731</a> [<a href="https://arxiv.org/pdf/2408.06731">pdf</a>, <a href="https://arxiv.org/format/2408.06731">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.CY</span> <span class="tag is-small">cs.AI</span> <span class="tag is-small">cs.CL</span></div>
  <p class="title is-5 mathjax">Large language models can consistently generate high-quality content for election disinformation operations</p>
  <p class="authors">Authors: Angus R. Williams, Liam Burke-Moore, Ryan Sze-Yin Chan, Florence E. Enock, Federico Nanni, Tvesha Sippy, Yi-Ling Chung, Evelina Gabasova, Kobi Hackenburg, Jonathan Bright</p>
  <p class="abstract mathjax">Abstract: Advances in large language models have raised concerns about their potential use in generating compelling election disinformation at scale. This study presents a two-part investigation into the capabilities of LLMs to automate stages of an election disinformation operation. First, we introduce DisElect, a novel evaluation dataset designed to measure LLM compliance with instructions to generate content for an election disinformation operation in a localised UK context, containing 2,200 malicious prompts and 50 benign prompts. Using DisElect, we test 13 LLMs and find that most models broadly comply with these requests; we also find that the few models which refuse malicious prompts also refuse benign election-related prompts, and are more likely to refuse to generate content from a right-wing perspective. Secondly, we conduct a series of experiments (N=2,340) to assess the "humanness" of LLMs: the extent to which disinformation operation content generated by an LLM is able to pass as human-written. Our experiments suggest that almost all LLMs tested released since 2022 produce election disinformation operation content indiscernible by human evaluators over 50% of the time. Notably, we observe that multiple models achieve above-human levels of humanness. Taken together, these findings suggest that current LLMs can be used to generate high-quality content for election disinformation operations, even in hyperlocalised scenarios, at far lower costs than traditional methods, and offer researchers and policymakers an empirical benchmark for the measurement and evaluation of these capabilities in current and future models.</p>
  <p class="is-size-7">Submitted 13 August, 2024; originally announced August 2024.</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04112">arXiv:2408.04112</a> [<a href="https://arxiv.org/pdf/2408.04112">pdf</a>, <a href="https://arxiv.org/format/2408.04112">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.HC</span> <span class="tag is-small">cs.AI</span> <span class="tag is-small">cs.CL</span></div>
  <p class="doi is-size-7">DOI: <a href="https://doi.org/10.1145/3654777.3676352">10.1145/3654777.3676352</a></p>
  <p class="title is-5 mathjax">Patchview: LLM-Powered Worldbuilding with Generative Dust and Magnet Visualization</p>
  <p class="authors">Authors: John Joon Young Chung, Max Kreminski</p>
  <p class="abstract mathjax">Abstract: Large language models (LLMs) can help writers build story worlds by generating world elements, such as factions, characters, and locations. However, making sense of many generated elements can be overwhelming. Moreover, if the user wants to precisely control aspects of generated elements that are difficult to specify verbally, prompting alone may be insufficient. We introduce Patchview, a customizable LLM-powered system that visually aids worldbuilding by allowing users to interact with story concepts and elements through the physical metaphor of magnets and dust. Elements in Patchview are visually dragged closer to concepts with high relevance, facilitating sensemaking. The user can also steer the generation with verbally elusive concepts by indicating the desired position of the element between concepts. When the user disagrees with the LLM's visualization and generation, they can correct those by repositioning the element. These corrections can be used to align the LLM's future behaviors to the user's perception. With a user study, we show that Patchview supports the sensemaking of world elements and steering of element generation, facilitating exploration during the worldbuilding process. Patchview provides insights on how customizable visual representation can help sensemake, steer, and align generative AI model behaviors with the user's intentions.</p>
  <p class="is-size-7">Submitted 7 August, 2024; originally announced August 2024.</p>
  <p class="comments is-size-7">Comments: Accepted to UIST2024</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15329">arXiv:2407.15329</a> [<a href="https://arxiv.org/pdf/2407.15329">pdf</a>, <a href="https://arxiv.org/format/2407.15329">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">eess.IV</span> <span class="tag is-small">cs.CV</span></div>
  <p class="title is-5 mathjax">Efficient Multi-disparity Transformer for Light Field Image Super-resolution</p>
  <p class="authors">Authors: Zeke Zexi Hu, Haodong Chen, Yuk Ying Chung, Xiaoming Chen</p>
  <p class="abstract mathjax">Abstract: This paper presents the Multi-scale Disparity Transformer (MDT), a novel Transformer tailored for light field image super-resolution (LFSR) that addresses the issues of computational redundancy and disparity entanglement caused by the indiscriminate processing of sub-aperture images inherent in conventional methods. MDT features a multi-branch structure, with each branch utilising independent disparity self-attention (DSA) to target specific disparity ranges, effectively reducing computational complexity and disentangling disparities. Building on this architecture, we present LF-MDTNet, an efficient LFSR network. Experimental results demonstrate that LF-MDTNet outperforms existing state-of-the-art methods by 0.37 dB and 0.41 dB PSNR at the 2x and 4x scales, achieving superior performance with fewer parameters and higher speed.</p>
  <p class="is-size-7">Submitted 21 July, 2024; originally announced July 2024.</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18375">arXiv:2406.18375</a> [<a href="https://arxiv.org/pdf/2406.18375">pdf</a>, <a href="https://arxiv.org/format/2406.18375">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.CV</span></div>
  <p class="title is-5 mathjax">From Majority to Minority: A Diffusion-based Augmentation for Underrepresented Groups in Skin Lesion Analysis</p>
  <p class="authors">Authors: Janet Wang, Yunsung Chung, Zhengming Ding, Jihun Hamm</p>
  <p class="abstract mathjax">Abstract: AI-based diagnoses have demonstrated dermatologist-level performance in classifying skin cancer. However, such systems are prone to under-performing when tested on data from minority groups that lack sufficient representation in the training sets. Although data collection and annotation offer the best means for promoting minority groups, these processes are costly and time-consuming. Prior works have suggested that data from majority groups may serve as a valuable information source to supplement the training of diagnosis tools for minority groups. In this work, we propose an effective diffusion-based augmentation framework that maximizes the use of rich information from majority groups to benefit minority groups. Using groups with different skin types as a case study, our results show that the proposed framework can generate synthetic images that improve diagnostic results for the minority groups, even when there is little or no reference data from these target groups. The practical value of our work is evident in medical imaging analysis, where under-diagnosis persists as a problem for certain groups due to insufficient representation.</p>
  <p class="is-size-7">Submitted 30 July, 2024; v1 submitted 26 June, 2024; originally announced June 2024.</p>
</li>
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13954">arXiv:2405.13954</a> [<a href="https://arxiv.org/pdf/2405.13954">pdf</a>, <a href="https://arxiv.org/format/2405.13954">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.LG</span> <span class="tag is-small">cs.AI</span> <span class="tag is-small">cs.CL</span></div>
  <p class="title is-5 mathjax">What is Your Data Worth to GPT? LLM-Scale Data Valuation with Influence Functions</p>
  <p class="authors">Authors: Sang Keun Choe, Hwijeen Ahn, Juhan Bae, Kewen Zhao, Minsoo Kang, Youngseog Chung, Adithya Pratapa, Willie Neiswanger, Emma Strubell, Teruko Mitamura, Jeff Schneider, Eduard Hovy, Roger Grosse, Eric Xing</p>
  <p class="abstract mathjax">Abstract: Large language models (LLMs) are trained on a vast amount of human-written data, but data providers often remain uncredited. In response to this issue, data valuation (or data attribution), which quantifies the contribution or value of each data point to the model output, has been discussed as a potential solution. Nevertheless, applying existing data valuation methods to recent LLMs and their vast training datasets has been largely limited by prohibitive compute and memory costs. In this work, we focus on influence functions, a popular gradient-based data valuation method, and significantly improve its scalability with an efficient gradient projection strategy called LoGra that leverages the gradient structure in backpropagation. We then provide a theoretical motivation of gradient projection approaches to influence functions to promote trust in the data valuation process. Lastly, we lower the barrier to implementing data valuation systems by introducing LogIX, a software package that can transform existing training code into data valuation code with minimal effort. In our data valuation experiments, LoGra achieves competitive accuracy against more expensive baselines while showing up to 6,500x improvement in throughput and 5x reduction in GPU memory usage when applied to Llama3-8B-Instruct and the 1B-token dataset.</p>
  <p class="is-size-7">Submitted 22 May, 2024; originally announced May 2024.</p>
</li>
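<p>The scalability idea in the abstract above is to compare gradients in a low-dimensional projected space rather than in the full parameter space. The toy below scores training examples by the inner product of projected gradients; the random projection is a generic stand-in and does not reproduce LoGra's structured projection or the LogIX API.</p>
<pre><code class="language-python">import numpy as np

def influence_scores(train_grads, test_grad, projection):
    # Score each training example by the dot product of its projected gradient
    # with the projected test gradient (a generic projected-influence toy).
    p_train = train_grads @ projection          # (n, k)
    p_test = test_grad @ projection             # (k,)
    return p_train @ p_test

rng = np.random.default_rng(0)
d, k = 10_000, 64                               # full gradient dim vs projected dim
train_grads = rng.normal(size=(5, d))
test_grad = rng.normal(size=d)
projection = rng.normal(size=(d, k)) / np.sqrt(k)
print(np.round(influence_scores(train_grads, test_grad, projection), 2))
</code></pre>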
<li class="arxiv-result">
  <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11703">arXiv:2405.11703</a> [<a href="https://arxiv.org/pdf/2405.11703">pdf</a>, <a href="https://arxiv.org/format/2405.11703">other</a>]</p>
  <div class="tags is-inline-block"><span class="tag is-small">cs.LG</span></div>
  <p class="title is-5 mathjax">QComp: A QSAR-Based Data Completion Framework for Drug Discovery</p>
  <p class="authors">Authors: Bingjia Yang, Yunsie Chung, Archer Y. Yang, Bo Yuan, Xiang Yu</p>
  <p class="abstract mathjax">Abstract: In drug discovery, in vitro and in vivo experiments reveal biochemical activities related to the efficacy and toxicity of compounds. The experimental data accumulate into massive, ever-evolving, and sparse datasets. Quantitative Structure-Activity Relationship (QSAR) models, which predict biochemical activities using only the structural information of compounds, face challenges in integrating the evolving experimental data as studies progress. We develop QSAR-Complete (QComp), a data completion framework to address this issue. Based on pre-existing QSAR models, QComp utilizes the correlation inherent in experimental data to enhance prediction accuracy across various tasks. Moreover, QComp emerges as a promising tool for guiding the optimal sequence of experiments by quantifying the reduction in statistical uncertainty for specific endpoints, thereby aiding in rational decision-making throughout the drug discovery process.</p>
  <p class="is-size-7">Submitted 19 May, 2024; originally announced May 2024.</p>
</li>
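<p>QComp's completion idea, as described above, is to combine QSAR predictions with the correlations observed among measured endpoints. A generic conditional-Gaussian sketch of that kind of completion follows; the covariance values, indexing scheme, and formulation here are illustrative assumptions rather than the paper's method.</p>
<pre><code class="language-python">import numpy as np

def complete_endpoints(qsar_mean, cov, measured_idx, measured_vals):
    # Fill unmeasured endpoints with the Gaussian conditional mean, treating the
    # QSAR predictions as the prior mean and `cov` as an endpoint covariance
    # estimated from historical assay data (a generic sketch, not QComp itself).
    n = len(qsar_mean)
    missing_idx = [i for i in range(n) if i not in measured_idx]
    S_mm = cov[np.ix_(measured_idx, measured_idx)]
    S_um = cov[np.ix_(missing_idx, measured_idx)]
    resid = np.asarray(measured_vals) - qsar_mean[measured_idx]
    filled = qsar_mean.copy()
    filled[measured_idx] = measured_vals
    filled[missing_idx] = qsar_mean[missing_idx] + S_um @ np.linalg.solve(S_mm, resid)
    return filled

qsar_mean = np.array([5.0, 6.2, 4.8])            # QSAR predictions for three endpoints
cov = np.array([[1.0, 0.8, 0.2],
                [0.8, 1.0, 0.1],
                [0.2, 0.1, 1.0]])
print(complete_endpoints(qsar_mean, cov, measured_idx=[0], measured_vals=[5.9]))
</code></pre>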
arXiv:2405.10345 [pdf, other] q-bio.QM (Quantitative Methods); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Machine Learning Driven Biomarker Selection for Medical Diagnosis
Authors: Divyagna Bavikadi, Ayushi Agarwal, Shashank Ganta, Yunro Chung, Lusheng Song, Ji Qiu, Paulo Shakarian
Abstract: Recent advances in experimental methods have enabled researchers to collect data on thousands of analytes simultaneously. This has led to correlational studies that associated molecular measurements with diseases such as Alzheimer's disease, liver cancer, and gastric cancer. However, the use of thousands of biomarkers selected from the analytes is not practical for real-world medical diagnosis and is likely undesirable due to the potential for spurious correlations. In this study, we evaluate 4 different methods for biomarker selection and 4 different machine learning (ML) classifiers for identifying correlations, evaluating 16 approaches in all. We found that contemporary methods outperform previously reported logistic regression in cases where 3 and 10 biomarkers are permitted. When specificity is fixed at 0.9, ML approaches produced a sensitivity of 0.240 (3 biomarkers) and 0.520 (10 biomarkers), while standard logistic regression provided a sensitivity of 0.000 (3 biomarkers) and 0.040 (10 biomarkers). We also noted that causal-based methods for biomarker selection proved to be the most performant when fewer biomarkers were permitted, while univariate feature selection was the most performant when a greater number of biomarkers were permitted.
Submitted 15 May, 2024; originally announced May 2024.
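The headline numbers above are sensitivities at a fixed specificity of 0.9. For readers who want to reproduce that kind of operating-point metric, the sketch below shows one standard way to compute it with scikit-learn on synthetic data; the feature selector and classifier are generic stand-ins, not the study's pipeline.

```python
# Illustrative computation of "sensitivity at specificity 0.9" for a scoring
# classifier; the data and model are synthetic placeholders, not the study's
# biomarker panels.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

X, y = make_classification(n_samples=600, n_features=200, n_informative=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# Univariate feature selection down to a small biomarker panel (here k=10).
selector = SelectKBest(f_classif, k=10).fit(X_tr, y_tr)
clf = LogisticRegression(max_iter=1000).fit(selector.transform(X_tr), y_tr)
scores = clf.predict_proba(selector.transform(X_te))[:, 1]

fpr, tpr, _ = roc_curve(y_te, scores)
specificity = 1 - fpr
sens_at_spec90 = tpr[specificity >= 0.9].max()   # best sensitivity with specificity >= 0.9
print(f"sensitivity at specificity 0.9: {sens_at_spec90:.3f}")
```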
arXiv:2405.05581 [pdf, other] cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence); cs.CL (Computation and Language) DOI: 10.1145/3630106.3662681
One vs. Many: Comprehending Accurate Information from Multiple Erroneous and Inconsistent AI Generations
Authors: Yoonjoo Lee, Kihoon Son, Tae Soo Kim, Jisu Kim, John Joon Young Chung, Eytan Adar, Juho Kim
Abstract: As Large Language Models (LLMs) are nondeterministic, the same input can generate different outputs, some of which may be incorrect or hallucinated. If run again, the LLM may correct itself and produce the correct answer. Unfortunately, most LLM-powered systems resort to single results which, correct or not, users accept. Having the LLM produce multiple outputs may help identify disagreements or alternatives. However, it is not obvious how the user will interpret conflicts or inconsistencies. To this end, we investigate how users perceive the AI model and comprehend the generated information when they receive multiple, potentially inconsistent, outputs. Through a preliminary study, we identified five types of output inconsistencies. Based on these categories, we conducted a study (N=252) in which participants were given one or more LLM-generated passages in response to an information-seeking question. We found that inconsistency within multiple LLM-generated outputs lowered the participants' perceived AI capacity, while also increasing their comprehension of the given information. Specifically, we observed that this positive effect of inconsistencies was most significant for participants who read two passages, compared to those who read three. Based on these findings, we present design implications: rather than regarding LLM output inconsistencies as a drawback, systems can reveal potential inconsistencies to transparently indicate the limitations of these models and promote critical LLM usage.
Submitted 9 May, 2024; originally announced May 2024.
Comments: Accepted to FAccT 2024

arXiv:2404.12416 [pdf, other] physics.plasm-ph (Plasma Physics); cs.LG (Machine Learning)
Full Shot Predictions for the DIII-D Tokamak via Deep Recurrent Networks
Authors: Ian Char, Youngseog Chung, Joseph Abbate, Egemen Kolemen, Jeff Schneider
Abstract: Although tokamaks are one of the most promising devices for realizing nuclear fusion as an energy source, there are still key obstacles when it comes to understanding the dynamics of the plasma and controlling it. As such, it is crucial that high quality models are developed to assist in overcoming these obstacles. In this work, we take an entirely data driven approach to learn such a model. In particular, we use historical data from the DIII-D tokamak to train a deep recurrent network that is able to predict the full time evolution of plasma discharges (or "shots"). Following this, we investigate how different training and inference procedures affect the quality and calibration of the shot predictions.
Submitted 17 April, 2024; originally announced April 2024.
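As a rough picture of the kind of model described in the entry above, the sketch below shows a plausible shape for a recurrent shot predictor: a GRU that maps past plasma states and actuator commands to the next state. Signal counts, shapes, and the one-step loss are assumptions for illustration, not the paper's DIII-D configuration.

```python
# Minimal sketch of a recurrent dynamics model for plasma "shots": a GRU that
# autoregressively predicts the next time step of several plasma signals from
# the current state and actuator commands. Dimensions are hypothetical.
import torch
import torch.nn as nn

class ShotPredictor(nn.Module):
    def __init__(self, n_state=8, n_actuator=4, hidden=128):
        super().__init__()
        self.rnn = nn.GRU(n_state + n_actuator, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_state)

    def forward(self, states, actuators):
        # states: (batch, T, n_state); actuators: (batch, T, n_actuator)
        h, _ = self.rnn(torch.cat([states, actuators], dim=-1))
        return self.head(h)                     # predicted state at t+1 for each t

model = ShotPredictor()
states = torch.randn(16, 200, 8)                # 16 shots, 200 time steps
actuators = torch.randn(16, 200, 4)
pred_next = model(states, actuators)
loss = nn.functional.mse_loss(pred_next[:, :-1], states[:, 1:])   # teacher-forced one-step loss
loss.backward()
print(pred_next.shape, float(loss))
```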
arXiv:2403.20103 [pdf, other] cs.CL (Computation and Language)
NLP for Counterspeech against Hate: A Survey and How-To Guide
Authors: Helena Bonaldi, Yi-Ling Chung, Gavin Abercrombie, Marco Guerini
Abstract: In recent years, counterspeech has emerged as one of the most promising strategies to fight online hate. These non-escalatory responses tackle online abuse while preserving the freedom of speech of the users, and can have a tangible impact in reducing online and offline violence. Recently, there has been growing interest from the Natural Language Processing (NLP) community in addressing the challenges of analysing, collecting, classifying, and automatically generating counterspeech, to reduce the huge burden of manually producing it. In particular, researchers have taken different directions in addressing these challenges, thus providing a variety of related tasks and resources. In this paper, we provide a guide for doing research on counterspeech, by describing, with detailed examples, the steps to undertake, and providing best practices that can be learnt from the NLP studies on this topic. Finally, we discuss open challenges and future directions of counterspeech research in NLP.
Submitted 29 March, 2024; originally announced March 2024.
Comments: To appear in Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (findings)

arXiv:2403.14117 [pdf, other] cs.HC (Human-Computer Interaction); cs.CL (Computation and Language) DOI: 10.1145/3613904.3642697
A Design Space for Intelligent and Interactive Writing Assistants
Authors: Mina Lee, Katy Ilonka Gero, John Joon Young Chung, Simon Buckingham Shum, Vipul Raheja, Hua Shen, Subhashini Venugopalan, Thiemo Wambsganss, David Zhou, Emad A. Alghamdi, Tal August, Avinash Bhat, Madiha Zahrah Choksi, Senjuti Dutta, Jin L. C. Guo, Md Naimul Hoque, Yewon Kim, Simon Knight, Seyed Parsa Neshaei, Agnia Sergeyuk, Antonette Shibani, Disha Shrivastava, Lila Shroff, Jessi Stark, Sarah Sterman, et al. (11 additional authors not shown)
Abstract: In our era of rapid technological advancement, the research landscape for writing assistants has become increasingly fragmented across various research communities. We seek to address this challenge by proposing a design space as a structured way to examine and explore the multidimensional space of intelligent and interactive writing assistants. Through a large community collaboration, we explore five aspects of writing assistants: task, user, technology, interaction, and ecosystem. Within each aspect, we define dimensions (i.e., fundamental components of an aspect) and codes (i.e., potential options for each dimension) by systematically reviewing 115 papers. Our design space aims to offer researchers and designers a practical tool to navigate, comprehend, and compare the various possibilities of writing assistants, and aid in the envisioning and design of new writing assistants.
Submitted 26 March, 2024; v1 submitted 21 March, 2024; originally announced March 2024.
Comments: Published as a conference paper at CHI 2024

arXiv:2403.09159 [pdf, ps, other] cs.CL (Computation and Language)
Basque and Spanish Counter Narrative Generation: Data Creation and Evaluation
Authors: Jaione Bengoetxea, Yi-Ling Chung, Marco Guerini, Rodrigo Agerri
Abstract: Counter Narratives (CNs) are non-negative textual responses to Hate Speech (HS) aiming at defusing online hatred and mitigating its spread across media. Despite the recent increase in HS content posted online, research on automatic CN generation has been relatively scarce and predominantly focused on English. In this paper, we present CONAN-EUS, a new Basque and Spanish dataset for CN generation developed by means of Machine Translation (MT) and professional post-editing. Being a parallel corpus, also with respect to the original English CONAN, it enables novel research on multilingual and crosslingual automatic generation of CNs. Our experiments on CN generation with mT5, a multilingual encoder-decoder model, show that generation greatly benefits from training on post-edited data, as opposed to relying on silver MT data only. These results are confirmed by their correlation with a qualitative manual evaluation, demonstrating that manually revised training data remains crucial for the quality of the generated CNs. Furthermore, multilingual data augmentation improves results over monolingual settings for structurally similar languages such as English and Spanish, while being detrimental for Basque, a language isolate. Similar findings occur in zero-shot crosslingual evaluations, where model transfer (fine-tuning in English and generating in a different target language) outperforms fine-tuning mT5 on machine-translated data for Spanish but not for Basque. This provides an interesting insight into the asymmetry in the multilinguality of generative models, a challenging topic which is still open to research.
Submitted 14 March, 2024; originally announced March 2024.
Comments: Accepted for the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING) 2024
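For readers unfamiliar with the modelling setup, the sketch below shows what fine-tuning and generating counter narratives with mT5 can look like using Hugging Face Transformers. The checkpoint (google/mt5-small), the placeholder HS/CN strings, and the decoding settings are illustrative assumptions, not the paper's exact configuration.

```python
# Hedged sketch of counter-narrative generation with mT5 via Hugging Face
# Transformers, roughly in the spirit of the experiments described above.
import torch
from transformers import AutoTokenizer, MT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

hate_speech = "..."          # a hate-speech input from the training corpus
counter_narrative = "..."    # its post-edited counter narrative (gold target)

# One supervised fine-tuning step on a (HS, CN) pair.
inputs = tokenizer(hate_speech, return_tensors="pt", truncation=True)
labels = tokenizer(counter_narrative, return_tensors="pt", truncation=True).input_ids
loss = model(**inputs, labels=labels).loss
loss.backward()

# Generation for a new input (e.g., zero-shot cross-lingual transfer).
with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=64, num_beams=4)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```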
arXiv:2403.07592 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Accurate Spatial Gene Expression Prediction by integrating Multi-resolution features
Authors: Youngmin Chung, Ji Hun Ha, Kyeong Chan Im, Joo Sang Lee
Abstract: Recent advancements in Spatial Transcriptomics (ST) technology have facilitated detailed gene expression analysis within tissue contexts. However, the high costs and methodological limitations of ST necessitate a more robust predictive model. In response, this paper introduces TRIPLEX, a novel deep learning framework designed to predict spatial gene expression from Whole Slide Images (WSIs). TRIPLEX uniquely harnesses multi-resolution features, capturing cellular morphology at individual spots, the local context around these spots, and the global tissue organization. By integrating these features through an effective fusion strategy, TRIPLEX achieves accurate gene expression prediction. Our comprehensive benchmark study, conducted on three public ST datasets and supplemented with Visium data from 10X Genomics, demonstrates that TRIPLEX outperforms current state-of-the-art models in Mean Squared Error (MSE), Mean Absolute Error (MAE), and Pearson Correlation Coefficient (PCC). The model's predictions align closely with ground truth gene expression profiles and tumor annotations, underscoring TRIPLEX's potential in advancing cancer diagnosis and treatment.
Submitted 25 April, 2024; v1 submitted 12 March, 2024; originally announced March 2024.
Comments: Accepted to CVPR 2024
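The fusion idea above can be pictured with a minimal stand-in: three feature vectors (target spot, local neighborhood, global slide) are concatenated, fused, and regressed to per-gene expression. This is only a hypothetical sketch of multi-resolution feature fusion, not the TRIPLEX architecture; dimensions and layer choices are assumptions.

```python
# Hypothetical sketch of multi-resolution feature fusion for spot-level gene
# expression prediction; a stand-in for the fusion strategy described above.
import torch
import torch.nn as nn

class MultiResFusion(nn.Module):
    def __init__(self, dim=256, n_genes=250):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Linear(3 * dim, dim), nn.ReLU(), nn.LayerNorm(dim)
        )
        self.head = nn.Linear(dim, n_genes)

    def forward(self, spot_feat, local_feat, global_feat):
        # Each input: (batch, dim) features from separate image encoders.
        fused = self.fuse(torch.cat([spot_feat, local_feat, global_feat], dim=-1))
        return self.head(fused)                 # predicted expression per gene

model = MultiResFusion()
pred = model(torch.randn(32, 256), torch.randn(32, 256), torch.randn(32, 256))
print(pred.shape)                               # torch.Size([32, 250])
```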
arXiv:2403.00439 [pdf, other] cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence) DOI: 10.1145/3613904.3642529
Authors' Values and Attitudes Towards AI-bridged Scalable Personalization of Creative Language Arts
Authors: Taewook Kim, Hyomin Han, Eytan Adar, Matthew Kay, John Joon Young Chung
Abstract: Generative AI has the potential to create a new form of interactive media: AI-bridged creative language arts (CLA), which bridge the author and audience by personalizing the author's vision to the audience's context and taste at scale. However, it is unclear what the authors' values and attitudes would be regarding AI-bridged CLA. To identify these values and attitudes, we conducted an interview study with 18 authors across eight genres (e.g., poetry, comics) by presenting speculative but realistic AI-bridged CLA scenarios. We identified three benefits derived from the dynamics between author, artifact, and audience: those that 1) authors get from the process, 2) audiences get from the artifact, and 3) authors get from the audience. We found how AI-bridged CLA would either promote or reduce these benefits, along with authors' concerns. We hope our investigation hints at how AI can provide intriguing experiences to CLA audiences while promoting authors' values.
Submitted 1 March, 2024; originally announced March 2024.
Comments: 16 pages, 6 figures, 2 tables. Accepted to ACM CHI 2024
arXiv:2402.11223 [pdf, other] cs.LG (Machine Learning)
HEAL: Brain-inspired Hyperdimensional Efficient Active Learning
Authors: Yang Ni, Zhuowen Zou, Wenjun Huang, Hanning Chen, William Youngwoo Chung, Samuel Cho, Ranganath Krishnan, Pietro Mercati, Mohsen Imani
Abstract: Drawing inspiration from the outstanding learning capability of our human brains, Hyperdimensional Computing (HDC) emerges as a novel computing paradigm, and it leverages high-dimensional vector representation and operations for brain-like lightweight Machine Learning (ML). Practical deployments of HDC have significantly enhanced the learning efficiency compared to current deep ML methods on a broad spectrum of applications. However, boosting the data efficiency of HDC classifiers in supervised learning remains an open question. In this paper, we introduce Hyperdimensional Efficient Active Learning (HEAL), a novel Active Learning (AL) framework tailored for HDC classification. HEAL proactively annotates unlabeled data points via uncertainty and diversity-guided acquisition, leading to a more efficient dataset annotation and lowering labor costs. Unlike conventional AL methods that only support classifiers built upon deep neural networks (DNN), HEAL operates without the need for gradient or probabilistic computations. This allows it to be effortlessly integrated with any existing HDC classifier architecture. The key design of HEAL is a novel approach for uncertainty estimation in HDC classifiers through a lightweight HDC ensemble with prior hypervectors. Additionally, by exploiting hypervectors as prototypes (i.e., compact representations), we develop an extra metric for HEAL to select diverse samples within each batch for annotation. Our evaluation shows that HEAL surpasses a diverse set of baselines in AL quality and achieves notably faster acquisition than many BNN-powered or diversity-guided AL methods, recording an 11x to 40,000x speedup in acquisition runtime per batch.
Submitted 17 February, 2024; originally announced February 2024.
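To make the acquisition idea concrete, here is a hedged NumPy sketch of uncertainty- and diversity-guided selection with hyperdimensional classifiers: an ensemble of class-prototype hypervectors votes on each unlabeled sample (disagreement acts as uncertainty), and a greedy pass keeps the batch spread out in hypervector space. The encoding, scoring, and sizes are illustrative assumptions, not HEAL's exact design.

```python
# Hedged sketch of uncertainty + diversity acquisition for an HDC classifier.
import numpy as np

rng = np.random.default_rng(2)
dim, n_classes, n_models, n_unlabeled, batch = 2048, 5, 8, 300, 10

# Ensemble of bipolar class-prototype hypervectors and encoded unlabeled samples.
prototypes = rng.choice([-1, 1], size=(n_models, n_classes, dim))
samples = rng.choice([-1, 1], size=(n_unlabeled, dim))

votes = np.argmax(np.einsum("nd,mcd->nmc", samples, prototypes), axis=-1)  # (n, models)
# Uncertainty: fraction of ensemble members disagreeing with the majority vote.
majority = np.apply_along_axis(lambda v: np.bincount(v, minlength=n_classes).argmax(), 1, votes)
uncertainty = (votes != majority[:, None]).mean(axis=1)

# Greedy diverse batch: prefer uncertain samples far from those already picked.
selected = []
for _ in range(batch):
    if selected:
        sims = samples @ samples[selected].T / dim                # cosine-like similarity
        diversity = 1 - sims.max(axis=1)
    else:
        diversity = np.ones(n_unlabeled)
    score = uncertainty * diversity
    score[selected] = -np.inf
    selected.append(int(score.argmax()))
print(selected)
```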
arXiv:2402.08025 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Beyond the Mud: Datasets and Benchmarks for Computer Vision in Off-Road Racing
Authors: Jacob Tyo, Motolani Olarinre, Youngseog Chung, Zachary C. Lipton
Abstract: Despite significant progress in optical character recognition (OCR) and computer vision systems, robustly recognizing text and identifying people in images taken in unconstrained in-the-wild environments remain an ongoing challenge. However, such obstacles must be overcome in practical applications of vision systems, such as identifying racers in photos taken during off-road racing events. To this end, we introduce two new challenging real-world datasets - the off-road motorcycle Racer Number Dataset (RND) and the Muddy Racer re-iDentification Dataset (MUDD) - to highlight the shortcomings of current methods and drive advances in OCR and person re-identification (ReID) under extreme conditions. These two datasets feature over 6,300 images taken during off-road competitions which exhibit a variety of factors that undermine even modern vision systems, namely mud, complex poses, and motion blur. We establish benchmark performance on both datasets using state-of-the-art models. Off-the-shelf models transfer poorly, reaching only 15% end-to-end (E2E) F1 score on text spotting, and 33% rank-1 accuracy on ReID. Fine-tuning yields major improvements, bringing model performance to 53% F1 score for E2E text spotting and 79% rank-1 accuracy on ReID, but still falls short of good performance. Our analysis exposes open problems in real-world OCR and ReID that necessitate domain-targeted techniques. With these datasets and analysis of model limitations, we aim to foster innovations in handling real-world conditions like mud and complex poses to drive progress in robust computer vision. All data was sourced from PerformancePhoto.co, a website used by professional motorsports photographers, racers, and fans. The top-performing text spotting and ReID models are deployed on this platform to power real-time race photo search.
Submitted 12 February, 2024; originally announced February 2024.
Comments: arXiv admin note: substantial text overlap with arXiv:2311.09256

arXiv:2401.12295 [pdf, other] cs.CL (Computation and Language)
Cheap Learning: Maximising Performance of Language Models for Social Data Science Using Minimal Data
Authors: Leonardo Castro-Gonzalez, Yi-Ling Chung, Hannak Rose Kirk, John Francis, Angus R. Williams, Pica Johansson, Jonathan Bright
Abstract: The field of machine learning has recently made significant progress in reducing the requirements for labelled training data when building new models. These 'cheaper' learning techniques hold significant potential for the social sciences, where development of large labelled training datasets is often a significant practical impediment to the use of machine learning for analytical tasks. In this article we review three 'cheap' techniques that have developed in recent years: weak supervision, transfer learning and prompt engineering. For the latter, we also review the particular case of zero-shot prompting of large language models. For each technique we provide a guide of how it works and demonstrate its application across six different realistic social science applications (two different tasks paired with three different dataset makeups). We show good performance for all techniques, and in particular we demonstrate how prompting of large language models can achieve high accuracy at very low cost. Our results are accompanied by a code repository to make it easy for others to duplicate our work and use it in their own research. Overall, our article is intended to stimulate further uptake of these techniques in the social sciences.
Submitted 22 January, 2024; originally announced January 2024.
Comments: 39 pages, 10 figures, 6 tables
ACM Class: I.2.7; J.4
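Of the three techniques reviewed above, zero-shot prompting is the easiest to picture in code. The sketch below uses a small instruction-tuned model (google/flan-t5-small) through the Transformers pipeline as a local stand-in for the large language models discussed in the article; the prompt template and labels are illustrative assumptions, not the authors' protocol.

```python
# A minimal zero-shot prompting sketch for a social-science labelling task.
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-small")

TEMPLATE = (
    "Classify the following social media post as 'abusive' or 'not abusive'.\n"
    "Post: {post}\n"
    "Answer with one label only."
)

posts = [
    "Have a wonderful day everyone!",
    "You are all worthless and should disappear.",
]

for post in posts:
    out = generator(TEMPLATE.format(post=post), max_new_tokens=5)
    print(post, "->", out[0]["generated_text"].strip())
```

Swapping in a larger instruction-tuned or hosted model is a drop-in change to the model argument; the prompt template carries most of the task specification.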
arXiv:2401.09294 [pdf, other] cs.SD (Sound); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing); eess.SP (Signal Processing)
T-FOLEY: A Controllable Waveform-Domain Diffusion Model for Temporal-Event-Guided Foley Sound Synthesis
Authors: Yoonjin Chung, Junwon Lee, Juhan Nam
Abstract: Foley sound, audio content inserted synchronously with videos, plays a critical role in the user experience of multimedia content. Recently, there has been active research in Foley sound synthesis, leveraging the advancements in deep generative models. However, such works mainly focus on replicating a single sound class or a textual sound description, neglecting temporal information, which is crucial in the practical applications of Foley sound. We present T-Foley, a Temporal-event-guided waveform generation model for Foley sound synthesis. T-Foley generates high-quality audio using two conditions: the sound class and temporal event feature. For temporal conditioning, we devise a temporal event feature and a novel conditioning technique named Block-FiLM. T-Foley achieves superior performance in both objective and subjective evaluation metrics and generates Foley sound well-synchronized with the temporal events. Additionally, we showcase T-Foley's practical applications, particularly in scenarios involving vocal mimicry for temporal event control. We show the demo on our companion website.
Submitted 17 January, 2024; originally announced January 2024.
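The temporal conditioning named above can be approximated with plain FiLM applied block-wise: a temporal event curve is pooled into blocks, mapped to per-channel scale and shift parameters, and used to modulate intermediate waveform features. This is a hedged stand-in for the Block-FiLM technique, not its actual definition; shapes and pooling choices are assumptions.

```python
# Hedged sketch of block-wise FiLM conditioning of waveform features on a
# temporal event curve (illustrative; the real Block-FiLM differs).
import torch
import torch.nn as nn

class BlockwiseFiLM(nn.Module):
    def __init__(self, cond_dim=1, channels=64, n_blocks=32):
        super().__init__()
        self.n_blocks = n_blocks
        self.to_scale_shift = nn.Linear(cond_dim, 2 * channels)

    def forward(self, feats, event):
        # feats: (batch, channels, T); event: (batch, T) temporal event curve.
        b, c, t = feats.shape
        blocks = event.reshape(b, self.n_blocks, -1).mean(dim=-1)            # (b, n_blocks)
        gamma, beta = self.to_scale_shift(blocks.unsqueeze(-1)).chunk(2, dim=-1)
        gamma = gamma.permute(0, 2, 1).repeat_interleave(t // self.n_blocks, dim=-1)
        beta = beta.permute(0, 2, 1).repeat_interleave(t // self.n_blocks, dim=-1)
        return gamma * feats + beta                                          # feature-wise modulation

film = BlockwiseFiLM()
feats = torch.randn(4, 64, 1024)          # intermediate waveform features
event = torch.rand(4, 1024)               # e.g., frame-level energy of the target events
print(film(feats, event).shape)           # torch.Size([4, 64, 1024])
```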
However, current solutions are predominantly data-driven without considering the prior knowledge of the underlying statistics relating event-streams and video frames. It highly relies on the non-linearity and generalization capability of the deep neural networks, thus, is struggling on reconstructing detailed textures when the scenes are complex. In this work, we propose \textbf{E2HQV}, a novel E2V paradigm designed to produce high-quality video frames from events. This approach leverages a model-aided deep learning framework, underpinned by a theory-inspired E2V model, which is meticulously derived from the fundamental imaging principles of event cameras. To deal with the issue of state-reset in the recurrent components of E2HQV, we also design a temporal shift embedding module to further improve the quality of the video frames. Comprehensive evaluations on the real world event camera datasets validate our approach, with E2HQV, notably outperforming state-of-the-art approaches, e.g., surpassing the second best by over 40\% for some evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08117v1-abstract-full').style.display = 'none'; document.getElementById('2401.08117v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in AAAI2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00740">arXiv:2401.00740</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.00740">pdf</a>, <a href="https://arxiv.org/format/2401.00740">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Subspace Isolation: Many-to-Many Transformer for Light Field Image Super-resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z+Z">Zeke Zexi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+V+Y+Y">Vera Yuk Ying Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yiran Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00740v1-abstract-short" style="display: inline;"> The effective extraction of spatial-angular features plays a crucial role in light field image super-resolution (LFSR) tasks, and the introduction of convolution and Transformers leads to significant improvement in this area. 
arXiv:2401.00740 [pdf, other]
Subjects: eess.IV, cs.CV
Beyond Subspace Isolation: Many-to-Many Transformer for Light Field Image Super-resolution
Authors: Zeke Zexi Hu, Xiaoming Chen, Vera Yuk Ying Chung, Yiran Shen
Abstract: The effective extraction of spatial-angular features plays a crucial role in light field image super-resolution (LFSR) tasks, and the introduction of convolution and Transformers has led to significant improvement in this area. Nevertheless, due to the large 4D data volume of light field images, many existing methods opt to decompose the data into a number of lower-dimensional subspaces and apply Transformers in each subspace individually. As a side effect, these methods inadvertently restrict the self-attention mechanism to a one-to-one scheme accessing only a limited subset of LF data, explicitly preventing comprehensive optimization on all spatial and angular cues. In this paper, we identify this limitation as subspace isolation and introduce a novel Many-to-Many Transformer (M2MT) to address it. M2MT aggregates angular information in the spatial subspace before performing the self-attention mechanism, enabling complete access to all information across all sub-aperture images (SAIs) in a light field image. Consequently, M2MT can comprehensively capture long-range correlation dependencies. With M2MT as the pivotal component, we develop a simple yet effective M2MT network for LFSR. Our experimental results demonstrate that M2MT achieves state-of-the-art performance across various public datasets. We further conduct in-depth analysis using local attribution maps (LAM) to obtain visual interpretability, and the results validate that M2MT is empowered with a truly non-local context in both spatial and angular subspaces to mitigate subspace isolation and acquire an effective spatial-angular representation.
Submitted 1 January, 2024; originally announced January 2024.
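To make the "subspace isolation" versus "many-to-many" distinction concrete, here is a rough PyTorch sketch contrasting attention confined to one sub-aperture image at a time with attention over all spatial-angular tokens jointly. The tensor shapes, module choice, and variable names are assumptions for illustration and do not reproduce the M2MT architecture.

```python
import torch
import torch.nn as nn

# Light field tokens: U*V sub-aperture images (angular) x H*W positions (spatial), dim C.
U, V, H, W, C = 5, 5, 8, 8, 32
tokens = torch.randn(1, U * V, H * W, C)   # (batch, angular, spatial, channels)

attn = nn.MultiheadAttention(embed_dim=C, num_heads=4, batch_first=True)

# "One-to-one" style: attention restricted to one angular view at a time,
# so spatial positions never see the other sub-aperture images.
per_view = [attn(tokens[:, a], tokens[:, a], tokens[:, a])[0] for a in range(U * V)]
isolated = torch.stack(per_view, dim=1)

# Many-to-many style (the flavour the abstract argues for): fold the angular axis
# into the token sequence so every query can attend to every spatial-angular token.
flat = tokens.reshape(1, U * V * H * W, C)
joint, _ = attn(flat, flat, flat)

print(isolated.shape, joint.shape)
```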
arXiv:2312.11949 [pdf, other]
Subjects: cs.HC
CreativeConnect: Supporting Reference Recombination for Graphic Design Ideation with Generative AI
Authors: DaEun Choi, Sumin Hong, Jeongeon Park, John Joon Young Chung, Juho Kim
Abstract: Graphic designers often get inspiration through the recombination of references. Our formative study (N=6) reveals that graphic designers focus on conceptual keywords during this process, and want support for discovering the keywords, expanding them, and exploring diverse options for recombining them, while still having room for their own creativity. We propose CreativeConnect, a system with generative AI pipelines that helps users discover useful elements from a reference image using keywords, recommends relevant keywords, generates diverse recombination options with user-selected keywords, and shows recombinations as sketches with text descriptions. Our user study (N=16) showed that CreativeConnect helped users discover keywords from the reference and generate multiple ideas based on them, ultimately helping users produce more design ideas with higher self-reported creativity compared to the baseline system without generative pipelines. While CreativeConnect was shown to be effective in ideation, we discuss how CreativeConnect can be extended to support other types of tasks in creativity support.
Submitted 6 March, 2024; v1 submitted 19 December, 2023; originally announced December 2023.

arXiv:2312.06279 [pdf, other]
Subjects: cs.LG, cs.AI
Regional Correlation Aided Mobile Traffic Prediction with Spatiotemporal Deep Learning
Authors: JeongJun Park, Lusungu J. Mwasinga, Huigyu Yang, Syed M. Raza, Duc-Tai Le, Moonseong Kim, Min Young Chung, Hyunseung Choo
Abstract: Mobile traffic data in urban regions shows differentiated patterns during different hours of the day. Exploiting these patterns enables highly accurate mobile traffic prediction for proactive network management. However, recent Deep Learning (DL) driven studies have exploited only spatiotemporal features and have ignored geographical correlations, causing high complexity and erroneous mobile traffic predictions. This paper addresses these limitations by proposing an enhanced mobile traffic prediction scheme that combines a clustering strategy based on daily mobile traffic peak time with a novel multi Temporal Convolutional Network with Long Short-Term Memory (multi TCN-LSTM) model. The mobile network cells that exhibit peak traffic during the same hour of the day are clustered together. Our experiments on large-scale real-world mobile traffic data show up to 28% performance improvement compared to state-of-the-art studies, which confirms the efficacy and viability of the proposed approach.
Submitted 11 December, 2023; originally announced December 2023.
Comments: 4 pages, 5 figures, 1 table. Accepted at the IEEE Consumer Communications & Networking Conference (CCNC) 2024.
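A minimal sketch of the peak-time clustering step described in this abstract, assuming hourly traffic arrays per cell; the TCN-LSTM predictor itself is not shown, and the array shapes and function name are illustrative assumptions.

```python
import numpy as np
from collections import defaultdict

def cluster_cells_by_peak_hour(traffic):
    """traffic: array of shape (num_cells, num_days, 24) with hourly load per cell.
    Cells whose average daily profile peaks in the same hour share a cluster,
    mirroring the peak-time clustering step described in the abstract."""
    daily_profile = traffic.mean(axis=1)          # (num_cells, 24) average day
    peak_hours = daily_profile.argmax(axis=1)     # peak hour per cell
    clusters = defaultdict(list)
    for cell_id, hour in enumerate(peak_hours):
        clusters[int(hour)].append(cell_id)
    return clusters                               # hour -> list of cell ids

rng = np.random.default_rng(0)
toy = rng.random((6, 7, 24))                      # 6 cells, 7 days, hourly load
print(cluster_cells_by_peak_hour(toy))
# In the proposed scheme, a separate TCN-LSTM predictor would then be trained per cluster.
```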
href="/search/cs?searchtype=author&amp;query=Li%2C+P">Pengwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Licht%2C+D">Daniel Licht</a>, <a href="/search/cs?searchtype=author&amp;query=Maillard%2C+J">Jean Maillard</a>, <a href="/search/cs?searchtype=author&amp;query=Mavlyutov%2C+R">Ruslan Mavlyutov</a>, <a href="/search/cs?searchtype=author&amp;query=Rakotoarison%2C+A">Alice Rakotoarison</a>, <a href="/search/cs?searchtype=author&amp;query=Sadagopan%2C+K+R">Kaushik Ram Sadagopan</a>, <a href="/search/cs?searchtype=author&amp;query=Ramakrishnan%2C+A">Abinesh Ramakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+T">Tuan Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Wenzek%2C+G">Guillaume Wenzek</a> , et al. (40 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05187v1-abstract-short" style="display: inline;"> Large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05187v1-abstract-full').style.display = 'inline'; document.getElementById('2312.05187v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05187v1-abstract-full" style="display: none;"> Large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model-SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. SeamlessM4T v2 provides the foundation on which our next two models are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one&#39;s voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. 
Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. The contributions to this work are publicly released and accessible at https://github.com/facebookresearch/seamless_communication <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05187v1-abstract-full').style.display = 'none'; document.getElementById('2312.05187v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09256">arXiv:2311.09256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.09256">pdf</a>, <a href="https://arxiv.org/format/2311.09256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reading Between the Mud: A Challenging Motorcycle Racer Number Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tyo%2C+J">Jacob Tyo</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Youngseog Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Olarinre%2C+M">Motolani Olarinre</a>, <a href="/search/cs?searchtype=author&amp;query=Lipton%2C+Z+C">Zachary C. Lipton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09256v1-abstract-short" style="display: inline;"> This paper introduces the off-road motorcycle Racer number Dataset (RnD), a new challenging dataset for optical character recognition (OCR) research. RnD contains 2,411 images from professional motorsports photographers that depict motorcycle racers in off-road competitions. The images exhibit a wide variety of factors that make OCR difficult, including mud occlusions, motion blur, non-standard fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09256v1-abstract-full').style.display = 'inline'; document.getElementById('2311.09256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09256v1-abstract-full" style="display: none;"> This paper introduces the off-road motorcycle Racer number Dataset (RnD), a new challenging dataset for optical character recognition (OCR) research. RnD contains 2,411 images from professional motorsports photographers that depict motorcycle racers in off-road competitions. The images exhibit a wide variety of factors that make OCR difficult, including mud occlusions, motion blur, non-standard fonts, glare, complex backgrounds, etc. The dataset has 5,578 manually annotated bounding boxes around visible motorcycle numbers, along with transcribed digits and letters. Our experiments benchmark leading OCR algorithms and reveal an end-to-end F1 score of only 0.527 on RnD, even after fine-tuning. 
Analysis of performance on different occlusion types shows mud as the primary challenge, degrading accuracy substantially compared to normal conditions. But the models struggle with other factors including glare, blur, shadows, and dust. Analysis exposes substantial room for improvement and highlights failure cases of existing models. RnD represents a valuable new benchmark to drive innovation in real-world OCR capabilities. The authors hope the community will build upon this dataset and baseline experiments to make progress on the open problem of robustly recognizing text in unconstrained natural environments. The dataset is available at https://github.com/JacobTyo/SwinTextSpotter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09256v1-abstract-full').style.display = 'none'; document.getElementById('2311.09256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08488">arXiv:2311.08488</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.08488">pdf</a>, <a href="https://arxiv.org/format/2311.08488">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MUDD: A New Re-Identification Dataset with Efficient Annotation for Off-Road Racers in Extreme Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tyo%2C+J">Jacob Tyo</a>, <a href="/search/cs?searchtype=author&amp;query=Olarinre%2C+M">Motolani Olarinre</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Youngseog Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Lipton%2C+Z+C">Zachary C. Lipton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08488v1-abstract-short" style="display: inline;"> Re-identifying individuals in unconstrained environments remains an open challenge in computer vision. We introduce the Muddy Racer re-IDentification Dataset (MUDD), the first large-scale benchmark for matching identities of motorcycle racers during off-road competitions. MUDD exhibits heavy mud occlusion, motion blurring, complex poses, and extreme lighting conditions previously unseen in existin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08488v1-abstract-full').style.display = 'inline'; document.getElementById('2311.08488v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08488v1-abstract-full" style="display: none;"> Re-identifying individuals in unconstrained environments remains an open challenge in computer vision. We introduce the Muddy Racer re-IDentification Dataset (MUDD), the first large-scale benchmark for matching identities of motorcycle racers during off-road competitions. 
MUDD exhibits heavy mud occlusion, motion blurring, complex poses, and extreme lighting conditions previously unseen in existing re-id datasets. We present an annotation methodology incorporating auxiliary information that reduced labeling time by over 65%. We establish benchmark performance using state-of-the-art re-id models including OSNet and ResNet-50. Without fine-tuning, the best models achieve only 33% Rank-1 accuracy. Fine-tuning on MUDD boosts results to 79% Rank-1, but significant room for improvement remains. We analyze the impact of real-world factors including mud, pose, lighting, and more. Our work exposes open problems in re-identifying individuals under extreme conditions. We hope MUDD serves as a diverse and challenging benchmark to spur progress in robust re-id, especially for computer vision applications in emerging sports analytics. All code and data can be found at https://github.com/JacobTyo/MUDD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08488v1-abstract-full').style.display = 'none'; document.getElementById('2311.08488v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07707">arXiv:2309.07707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.07707">pdf</a>, <a href="https://arxiv.org/format/2309.07707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CoLLD: Contrastive Layer-to-layer Distillation for Compressing Multilingual Pre-trained Speech Encoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+H">Heng-Jui Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+N">Ning Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Mavlyutov%2C+R">Ruslan Mavlyutov</a>, <a href="/search/cs?searchtype=author&amp;query=Popuri%2C+S">Sravya Popuri</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+Y">Yu-An Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.07707v2-abstract-short" style="display: inline;"> Large-scale self-supervised pre-trained speech encoders outperform conventional approaches in speech recognition and translation tasks. Due to the high cost of developing these large models, building new encoders for new tasks and deploying them to on-device applications are infeasible. 
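For readers unfamiliar with the metric quoted here, the sketch below computes Rank-1 accuracy from a query-gallery distance matrix in its simplest form (no camera-based filtering); it is a generic illustration, not code from the MUDD benchmark.

```python
import numpy as np

def rank1_accuracy(dist, query_ids, gallery_ids):
    """Rank-1 accuracy for re-identification: the fraction of queries whose single
    nearest gallery image (smallest distance) carries the same identity label.
    `dist` has shape (num_queries, num_gallery)."""
    nearest = dist.argmin(axis=1)
    return float(np.mean(gallery_ids[nearest] == query_ids))

# toy example: 3 queries, 4 gallery images
dist = np.array([[0.1, 0.9, 0.8, 0.7],
                 [0.6, 0.2, 0.9, 0.8],
                 [0.9, 0.8, 0.3, 0.4]])
query_ids = np.array([7, 8, 9])
gallery_ids = np.array([7, 8, 5, 9])
print(rank1_accuracy(dist, query_ids, gallery_ids))  # 2/3: the third query's nearest match has id 5
```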
arXiv:2309.07707 [pdf, other]
Subjects: cs.CL, cs.SD, eess.AS
CoLLD: Contrastive Layer-to-layer Distillation for Compressing Multilingual Pre-trained Speech Encoders
Authors: Heng-Jui Chang, Ning Dong, Ruslan Mavlyutov, Sravya Popuri, Yu-An Chung
Abstract: Large-scale self-supervised pre-trained speech encoders outperform conventional approaches in speech recognition and translation tasks. Due to the high cost of developing these large models, building new encoders for new tasks and deploying them to on-device applications are infeasible. Prior studies propose model compression methods to address this issue, but those works focus on smaller models and less realistic tasks. Thus, we propose Contrastive Layer-to-layer Distillation (CoLLD), a novel knowledge distillation method that compresses pre-trained speech encoders by leveraging masked prediction and contrastive learning to train student models to copy the behavior of a large teacher model. CoLLD outperforms prior methods and closes the gap between small and large models on multilingual speech-to-text translation and recognition benchmarks.
Submitted 27 December, 2023; v1 submitted 14 September, 2023; originally announced September 2023.
Comments: Accepted to ICASSP 2024.
arXiv:2308.11596 [pdf, other]
Subjects: cs.CL
SeamlessM4T: Massively Multilingual & Multimodal Machine Translation
Authors: Seamless Communication, Loïc Barrault, Yu-An Chung, Mariano Cora Meglioli, David Dale, Ning Dong, Paul-Ambroise Duquenne, Hady Elsahar, Hongyu Gong, Kevin Heffernan, John Hoffman, Christopher Klaiber, Pengwei Li, Daniel Licht, Jean Maillard, Alice Rakotoarison, Kaushik Ram Sadagopan, Guillaume Wenzek, Ethan Ye, Bapi Akula, Peng-Jen Chen, Naji El Hachem, Brian Ellis, Gabriel Mejia Gonzalez, Justin Haaheim, et al. (43 additional authors not shown)
Abstract: What does it take to create the Babel Fish, a tool that can help individuals translate speech between any two languages? While recent breakthroughs in text-based models have pushed machine translation coverage beyond 200 languages, unified speech-to-speech translation models have yet to achieve similar strides. More specifically, conventional speech-to-speech translation systems rely on cascaded systems that perform translation progressively, putting high-performing unified systems out of reach. To address these gaps, we introduce SeamlessM4T, a single model that supports speech-to-speech translation, speech-to-text translation, text-to-speech translation, text-to-text translation, and automatic speech recognition for up to 100 languages. To build this, we used 1 million hours of open speech audio data to learn self-supervised speech representations with w2v-BERT 2.0. Subsequently, we created a multimodal corpus of automatically aligned speech translations. Filtered and combined with human-labeled and pseudo-labeled data, we developed the first multilingual system capable of translating from and into English for both speech and text. On FLEURS, SeamlessM4T sets a new standard for translations into multiple target languages, achieving an improvement of 20% BLEU over the previous SOTA in direct speech-to-text translation. Compared to strong cascaded models, SeamlessM4T improves the quality of into-English translation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in speech-to-speech. Tested for robustness, our system performs better against background noises and speaker variations in speech-to-text tasks compared to the current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and added toxicity to assess translation safety. Finally, all contributions in this work are open-sourced and accessible at https://github.com/facebookresearch/seamless_communication
Submitted 24 October, 2023; v1 submitted 22 August, 2023; originally announced August 2023.
ACM Class: I.2.7

arXiv:2308.05184 [pdf, other]
Subjects: cs.HC, cs.AI
DOI: 10.1145/3586183.3606777
PromptPaint: Steering Text-to-Image Generation Through Paint Medium-like Interactions
Authors: John Joon Young Chung, Eytan Adar
Abstract: While diffusion-based text-to-image (T2I) models provide a simple and powerful way to generate images, guiding this generation remains a challenge. For concepts that are difficult to describe through language, users may struggle to create prompts. Moreover, many of these models are built as end-to-end systems, lacking support for iterative shaping of the image. In response, we introduce PromptPaint, which combines T2I generation with interactions that model how we use colored paints. PromptPaint allows users to go beyond language to mix prompts that express challenging concepts. Just as we iteratively tune colors through layered placements of paint on a physical canvas, PromptPaint similarly allows users to apply different prompts to different canvas areas and times of the generative process. Through a set of studies, we characterize different approaches for mixing prompts, design trade-offs, and socio-technical challenges for generative models. With PromptPaint we provide insight into future steerable generative tools.
Submitted 9 August, 2023; originally announced August 2023.
Comments: Accepted to UIST 2023.
arXiv:2307.16811 [pdf, other]
Subjects: cs.CL, cs.CY
DoDo Learning: DOmain-DemOgraphic Transfer in Language Models for Detecting Abuse Targeted at Public Figures
Authors: Angus R. Williams, Hannah Rose Kirk, Liam Burke, Yi-Ling Chung, Ivan Debono, Pica Johansson, Francesca Stevens, Jonathan Bright, Scott A. Hale
Abstract: Public figures receive a disproportionate amount of abuse on social media, impacting their active participation in public life. Automated systems can identify abuse at scale, but labelling training data is expensive, complex and potentially harmful. So it is desirable that systems are efficient and generalisable, handling both shared and specific aspects of online abuse. We explore the dynamics of cross-group text classification in order to understand how well classifiers trained on one domain or demographic can transfer to others, with a view to building more generalisable abuse classifiers. We fine-tune language models to classify tweets targeted at public figures across DOmains (sport and politics) and DemOgraphics (women and men) using our novel DODO dataset, containing 28,000 labelled entries, split equally across four domain-demographic pairs. We find that (i) small amounts of diverse data are hugely beneficial to generalisation and model adaptation; (ii) models transfer more easily across demographics, but models trained on cross-domain data are more generalisable; (iii) some groups contribute more to generalisability than others; and (iv) dataset similarity is a signal of transferability.
Submitted 25 April, 2024; v1 submitted 31 July, 2023; originally announced July 2023.
Comments: 15 pages, 7 figures, 4 tables.

arXiv:2307.04761 [pdf, other]
Subjects: cs.CL, cs.AI, cs.CY
Understanding Counterspeech for Online Harm Mitigation
Authors: Yi-Ling Chung, Gavin Abercrombie, Florence Enock, Jonathan Bright, Verena Rieser
Abstract: Counterspeech offers direct rebuttals to hateful speech by challenging perpetrators of hate and showing support to targets of abuse. It provides a promising alternative to more contentious measures, such as content moderation and deplatforming, by contributing a greater amount of positive online speech rather than attempting to mitigate harmful content through removal. Advances in the development of large language models mean that the process of producing counterspeech could be made more efficient by automating its generation, which would enable large-scale online campaigns. However, we currently lack a systematic understanding of several important factors relating to the efficacy of counterspeech for hate mitigation, such as which types of counterspeech are most effective, what the optimal conditions for implementation are, and which specific effects of hate it can best ameliorate. This paper aims to fill this gap by systematically reviewing counterspeech research in the social sciences and comparing methodologies and findings with computer science efforts in automatic counterspeech generation. By taking this multi-disciplinary view, we identify promising future directions in both fields.
Submitted 1 July, 2023; originally announced July 2023.
Comments: 21 pages, 2 figures, 2 tables.

arXiv:2306.15189 [pdf, ps, other]
Subjects: cs.CV
FBA-Net: Foreground and Background Aware Contrastive Learning for Semi-Supervised Atrium Segmentation
Authors: Yunsung Chung, Chanho Lim, Chao Huang, Nassir Marrouche, Jihun Hamm
Abstract: Medical image segmentation of gadolinium enhancement magnetic resonance imaging (GE MRI) is an important task in clinical applications. However, manual annotation is time-consuming and requires specialized expertise. Semi-supervised segmentation methods that leverage both labeled and unlabeled data have shown promise, with contrastive learning emerging as a particularly effective approach. In this paper, we propose a contrastive learning strategy of foreground and background representations for semi-supervised 3D medical image segmentation (FBA-Net). Specifically, we leverage the contrastive loss to learn representations of both the foreground and background regions in the images. By training the network to distinguish between foreground-background pairs, we aim to learn a representation that can effectively capture the anatomical structures of interest. Experiments on three medical segmentation datasets demonstrate state-of-the-art performance. Notably, our method achieves a Dice score of 91.31% with only 20% labeled data, which is remarkably close to the 91.62% score of the fully supervised method that uses 100% labeled data on the left atrium dataset. Our framework has the potential to advance the field of semi-supervised 3D medical image segmentation and enable more efficient and accurate analysis of medical images with a limited amount of annotated labels.
Submitted 27 June, 2023; originally announced June 2023.
Comments: 11 pages, 2 figures.
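The Dice score reported in this abstract is the standard overlap metric for segmentation masks; a minimal sketch of its computation on binary masks follows (illustrative only, not code from FBA-Net).

```python
import numpy as np

def dice_score(pred, target, eps=1e-7):
    """Dice coefficient for binary masks: 2*|P & T| / (|P| + |T|).
    A score of 1.0 means perfect overlap with the ground-truth mask."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

# toy 2D example; in the paper's setting the masks would be 3D atrium segmentations
pred = np.array([[1, 1, 0], [0, 1, 0]])
target = np.array([[1, 0, 0], [0, 1, 1]])
print(round(dice_score(pred, target), 3))  # 2*2 / (3+3) = 0.667
```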
