Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 79 results for author: <span class="mathjax">Saito, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Saito%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Saito, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Saito%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Saito, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
Sorted by announcement date (newest first), 50 results per page. Page 1 of 2.

1. arXiv:2410.23730 [pdf, other] cs.CV
   An Empirical Analysis of GPT-4V's Performance on Fashion Aesthetic Evaluation
   Authors: Yuki Hirakawa, Takashi Wada, Kazuya Morishita, Ryotaro Shimizu, Takuya Furusawa, Sai Htaung Kham, Yuki Saito
   Abstract: Fashion aesthetic evaluation is the task of estimating how well the outfits worn by individuals in images suit them. In this work, we examine the zero-shot performance of GPT-4V on this task for the first time. We show that its predictions align fairly well with human judgments on our datasets, and also find that it struggles with ranking outfits in similar colors. The code is available at https://github.com/st-tech/gpt4v-fashion-aesthetic-evaluation.
   Submitted 31 October, 2024; originally announced October 2024.
2. arXiv:2410.15532 [pdf, ps, other] cs.SD eess.AS
   Construction and Analysis of Impression Caption Dataset for Environmental Sounds
   Authors: Yuki Okamoto, Ryotaro Nagase, Minami Okamoto, Yuki Saito, Keisuke Imoto, Takahiro Fukumori, Yoichi Yamashita
   Abstract: Some datasets describing the content and order of occurrence of sounds have been released for conversion between environmental sound and text. However, very few texts include information on the impressions humans feel, such as "sharp" and "gorgeous," when they hear environmental sounds. In this study, we constructed a dataset of impression captions for environmental sounds that describe the impressions humans have when hearing these sounds. We used ChatGPT to generate impression captions, and humans then selected the most appropriate captions for each sound. Our dataset consists of 3,600 impression captions for environmental sounds. To evaluate the appropriateness of impression captions for environmental sounds, we conducted subjective and objective evaluations. The evaluation results indicate that appropriate impression captions for environmental sounds can be generated.
   Submitted 20 October, 2024; originally announced October 2024.
3. arXiv:2410.13248 [pdf, other] cs.LG cs.AI cs.CL cs.IR
   Disentangling Likes and Dislikes in Personalized Generative Explainable Recommendation
   Authors: Ryotaro Shimizu, Takashi Wada, Yu Wang, Johannes Kruse, Sean O'Brien, Sai HtaungKham, Linxin Song, Yuya Yoshikawa, Yuki Saito, Fugee Tsung, Masayuki Goto, Julian McAuley
   Abstract: Recent research on explainable recommendation generally frames the task as a standard text generation problem, and evaluates models simply based on the textual similarity between the predicted and ground-truth explanations. However, this approach fails to consider one crucial aspect of the systems: whether their outputs accurately reflect the users' (post-purchase) sentiments, i.e., whether and why they would like and/or dislike the recommended items. To shed light on this issue, we introduce new datasets and evaluation methods that focus on the users' sentiments. Specifically, we construct the datasets by explicitly extracting users' positive and negative opinions from their post-purchase reviews using an LLM, and propose to evaluate systems based on whether the generated explanations 1) align well with the users' sentiments, and 2) accurately identify both positive and negative opinions of users on the target items. We benchmark several recent models on our datasets and demonstrate that achieving strong performance on existing metrics does not ensure that the generated explanations align well with the users' sentiments. Lastly, we find that existing models can provide more sentiment-aware explanations when the users' (predicted) ratings for the target items are directly fed into the models as input. We will release our code and datasets upon acceptance.
   Submitted 17 October, 2024; originally announced October 2024.
4. arXiv:2409.10992 [pdf, other] cs.IR
   A Best-of-Both Approach to Improve Match Predictions and Reciprocal Recommendations for Job Search
   Authors: Shuhei Goda, Yudai Hayashi, Yuta Saito
   Abstract: Matching users with mutual preferences is a critical aspect of services driven by reciprocal recommendations, such as job search. To produce recommendations in such scenarios, one can predict match probabilities and construct rankings based on these predictions. However, this direct match prediction approach often underperforms due to the extreme sparsity of match labels. Therefore, most existing methods predict preferences separately for each direction (e.g., job seeker to employer and employer to job seeker) and then aggregate the predictions to generate overall matching scores and produce recommendations. However, this typical approach often leads to practical issues, such as biased error propagation between the two models. This paper introduces and demonstrates a novel and practical solution to improve reciprocal recommendations in production by leveraging pseudo-match scores. Specifically, our approach generates dense and more directly relevant pseudo-match scores by combining the true match labels, which are accurate but sparse, with relatively inaccurate but dense match predictions. We then train a meta-model to output the final match predictions by minimizing the prediction loss against the pseudo-match scores. Our method can be seen as a best-of-both (BoB) approach, as it combines the high-level ideas of both direct match prediction and the two-separate-models approach. It also allows for user-specific weights to construct personalized pseudo-match scores, achieving even better matching performance through appropriate tuning of the weights. Offline experiments on real-world job search data demonstrate the superior performance of our BoB method, particularly with personalized pseudo-match scores, compared to existing approaches in terms of finding potential matches.
   Submitted 18 September, 2024; v1 submitted 17 September, 2024; originally announced September 2024.
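The pseudo-match construction in the entry above lends itself to a compact illustration. Below is a minimal Python sketch of the high-level idea only: blend sparse-but-accurate labels with dense-but-biased predictions, then fit a meta-model against the blend. The convex-combination form, the scikit-learn models, and all names are assumptions for illustration, not the paper's implementation.

```python
# Sketch of the pseudo-match-score idea from arXiv:2409.10992 (illustrative
# assumptions throughout; the paper's exact construction may differ).
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)

# Toy interaction data: features of (job seeker, employer) pairs and sparse
# binary match labels.
X = rng.normal(size=(1000, 8))
true_match = (X[:, 0] + X[:, 1] + rng.normal(scale=0.5, size=1000) > 2).astype(float)

# Step 1: dense but relatively inaccurate match predictions from a direct model.
direct_model = LogisticRegression().fit(X, true_match)
dense_pred = direct_model.predict_proba(X)[:, 1]

# Step 2: pseudo-match scores blend the sparse labels with the dense
# predictions; w could be tuned per user for the personalized variant
# mentioned in the abstract.
w = 0.7
pseudo_match = w * true_match + (1.0 - w) * dense_pred

# Step 3: a meta-model minimizes its prediction loss against the pseudo-match
# scores rather than the raw sparse labels.
meta_model = GradientBoostingRegressor().fit(X, pseudo_match)
ranking_scores = meta_model.predict(X)  # used to rank candidates per user
```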
5. arXiv:2409.09305 [pdf, other] cs.SD cs.AI cs.LG eess.AS
   The T05 System for The VoiceMOS Challenge 2024: Transfer Learning from Deep Image Classifier to Naturalness MOS Prediction of High-Quality Synthetic Speech
   Authors: Kaito Baba, Wataru Nakata, Yuki Saito, Hiroshi Saruwatari
   Abstract: We present our system (denoted as T05) for the VoiceMOS Challenge (VMC) 2024. Our system was designed for the VMC 2024 Track 1, which focused on the accurate prediction of naturalness mean opinion score (MOS) for high-quality synthetic speech. In addition to a pretrained self-supervised learning (SSL)-based speech feature extractor, our system incorporates a pretrained image feature extractor to capture the differences among synthetic speech observed in speech spectrograms. We first separately train two MOS predictors, each using either an SSL-based or a spectrogram-based feature. Then, we fine-tune the two predictors for better MOS prediction using the fusion of the two extracted features. In the VMC 2024 Track 1, our T05 system achieved first place in 7 out of 16 evaluation metrics and second place in the remaining 9 metrics, with a significant difference compared to those ranked third and below. We also report the results of our ablation study to investigate essential factors of our system.
   Submitted 14 September, 2024; originally announced September 2024.
   Comments: Accepted by IEEE SLT 2024. Our MOS prediction system (UTMOSv2) is available at https://github.com/sarulab-speech/UTMOSv2
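As a rough illustration of the two-branch design described above (separate SSL-based and spectrogram-based MOS predictors, then a fused head fine-tuned on both), here is a minimal PyTorch sketch. The dimensions and the concatenation-based fusion are assumptions; the actual system is in the UTMOSv2 repository linked above.

```python
# Minimal two-branch MOS-predictor sketch after arXiv:2409.09305 (shapes and
# fusion scheme are illustrative assumptions).
import torch
import torch.nn as nn

class FusionMOSPredictor(nn.Module):
    def __init__(self, ssl_dim=768, img_dim=1280, hidden=256):
        super().__init__()
        # Heads used when pretraining each branch separately.
        self.ssl_head = nn.Sequential(nn.Linear(ssl_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        self.img_head = nn.Sequential(nn.Linear(img_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        # Fused head trained in the fine-tuning stage on both feature sets.
        self.fused_head = nn.Sequential(
            nn.Linear(ssl_dim + img_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1)
        )

    def forward(self, ssl_feat, img_feat, fuse=True):
        if fuse:
            return self.fused_head(torch.cat([ssl_feat, img_feat], dim=-1))
        return self.ssl_head(ssl_feat), self.img_head(img_feat)

# Usage: ssl_feat from a speech SSL encoder, img_feat from an image classifier
# applied to the spectrogram; both pooled to utterance level.
model = FusionMOSPredictor()
mos = model(torch.randn(4, 768), torch.randn(4, 1280))
```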
6. arXiv:2409.07265 [pdf, other] cs.SD cs.CL eess.AS
   Cross-Dialect Text-To-Speech in Pitch-Accent Language Incorporating Multi-Dialect Phoneme-Level BERT
   Authors: Kazuki Yamauchi, Yuki Saito, Hiroshi Saruwatari
   Abstract: We explore cross-dialect text-to-speech (CD-TTS), a task to synthesize learned speakers' voices in non-native dialects, especially in pitch-accent languages. CD-TTS is important for developing voice agents that naturally communicate with people across regions. We present a novel TTS model comprising three sub-modules to perform competitively at this task. We first train a backbone TTS model to synthesize dialect speech from text conditioned on phoneme-level accent latent variables (ALVs) extracted from speech by a reference encoder. Then, we train an ALV predictor to predict ALVs tailored to a target dialect from input text, leveraging our novel multi-dialect phoneme-level BERT. We conduct multi-dialect TTS experiments and evaluate the effectiveness of our model by comparing it with a baseline derived from conventional dialect TTS methods. The results show that our model improves the dialectal naturalness of synthetic speech in CD-TTS.
   Submitted 11 September, 2024; originally announced September 2024.
   Comments: Accepted by IEEE SLT 2024

7. arXiv:2409.02599 [pdf, other] cs.IR cs.CV cs.LG
   A Fashion Item Recommendation Model in Hyperbolic Space
   Authors: Ryotaro Shimizu, Yu Wang, Masanari Kimura, Yuki Hirakawa, Takashi Wada, Yuki Saito, Julian McAuley
   Abstract: In this work, we propose a fashion item recommendation model that incorporates hyperbolic geometry into user and item representations. Using hyperbolic space, our model aims to capture implicit hierarchies among items based on their visual data and users' purchase history. During training, we apply a multi-task learning framework that considers both hyperbolic and Euclidean distances in the loss function. Our experiments on three data sets show that our model performs better than previous models trained in Euclidean space only, confirming the effectiveness of our model. Our ablation studies show that multi-task learning plays a key role, and removing the Euclidean loss substantially deteriorates the model performance.
   Submitted 4 September, 2024; originally announced September 2024.
   Comments: This work was presented at the CVFAD Workshop at CVPR 2024
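The multi-task loss in entry 7 combines hyperbolic and Euclidean distances. The sketch below shows one plausible reading: a Poincaré-ball distance alongside a Euclidean one inside a BPR-style pairwise loss. The loss form and the weighting are assumptions for illustration, not the paper's definition.

```python
# Hedged sketch of a hyperbolic + Euclidean multi-task loss after
# arXiv:2409.02599.
import torch

def poincare_distance(u, v, eps=1e-5):
    # Poincare-ball distance: arcosh(1 + 2|u-v|^2 / ((1-|u|^2)(1-|v|^2))).
    sq = ((u - v) ** 2).sum(-1)
    den = (1 - (u ** 2).sum(-1)).clamp_min(eps) * (1 - (v ** 2).sum(-1)).clamp_min(eps)
    return torch.acosh(1 + 2 * sq / den)

def multitask_loss(user, pos_item, neg_item, alpha=0.5):
    # Hyperbolic part: positive items should be closer to the user than negatives.
    hyp = -torch.log(torch.sigmoid(
        poincare_distance(user, neg_item) - poincare_distance(user, pos_item))).mean()
    # Euclidean part on the same embeddings; the paper's ablation found that
    # removing this term substantially hurts performance.
    euc = -torch.log(torch.sigmoid(
        ((user - neg_item) ** 2).sum(-1) - ((user - pos_item) ** 2).sum(-1))).mean()
    return alpha * hyp + (1 - alpha) * euc

# Embeddings must stay inside the unit ball for the hyperbolic distance.
u, p, n = (0.05 * torch.randn(32, 64) for _ in range(3))
loss = multitask_loss(u, p, n)
```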
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02599v1-abstract-full').style.display = 'none'; document.getElementById('2409.02599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work was presented at the CVFAD Workshop at CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11202">arXiv:2408.11202</a> <span> [<a href="https://arxiv.org/pdf/2408.11202">pdf</a>, <a href="https://arxiv.org/format/2408.11202">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Effective Off-Policy Evaluation and Learning in Contextual Combinatorial Bandits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shimizu%2C+T">Tatsuhiro Shimizu</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+K">Koichi Tanaka</a>, <a href="/search/cs?searchtype=author&query=Kishimoto%2C+R">Ren Kishimoto</a>, <a href="/search/cs?searchtype=author&query=Kiyohara%2C+H">Haruka Kiyohara</a>, <a href="/search/cs?searchtype=author&query=Nomura%2C+M">Masahiro Nomura</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuta Saito</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11202v1-abstract-short" style="display: inline;"> We explore off-policy evaluation and learning (OPE/L) in contextual combinatorial bandits (CCB), where a policy selects a subset in the action space. For example, it might choose a set of furniture pieces (a bed and a drawer) from available items (bed, drawer, chair, etc.) for interior design sales. This setting is widespread in fields such as recommender systems and healthcare, yet OPE/L of CCB r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11202v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11202v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11202v1-abstract-full" style="display: none;"> We explore off-policy evaluation and learning (OPE/L) in contextual combinatorial bandits (CCB), where a policy selects a subset in the action space. For example, it might choose a set of furniture pieces (a bed and a drawer) from available items (bed, drawer, chair, etc.) for interior design sales. This setting is widespread in fields such as recommender systems and healthcare, yet OPE/L of CCB remains unexplored in the relevant literature. 
Typical OPE/L methods such as regression and importance sampling can be applied to the CCB problem, however, they face significant challenges due to high bias or variance, exacerbated by the exponential growth in the number of available subsets. To address these challenges, we introduce a concept of factored action space, which allows us to decompose each subset into binary indicators. This formulation allows us to distinguish between the ''main effect'' derived from the main actions, and the ''residual effect'', originating from the supplemental actions, facilitating more effective OPE. Specifically, our estimator, called OPCB, leverages an importance sampling-based approach to unbiasedly estimate the main effect, while employing regression-based approach to deal with the residual effect with low variance. OPCB achieves substantial variance reduction compared to conventional importance sampling methods and bias reduction relative to regression methods under certain conditions, as illustrated in our theoretical analysis. Experiments demonstrate OPCB's superior performance over typical methods in both OPE and OPL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11202v1-abstract-full').style.display = 'none'; document.getElementById('2408.11202v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at RecSys2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10846">arXiv:2408.10846</a> <span> [<a href="https://arxiv.org/pdf/2408.10846">pdf</a>, <a href="https://arxiv.org/format/2408.10846">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Harmonizing Attention: Training-free Texture-aware Geometry Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ikuta%2C+E">Eito Ikuta</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yohan Lee</a>, <a href="/search/cs?searchtype=author&query=Iohara%2C+A">Akihiro Iohara</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yu Saito</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+T">Toshiyuki Tanaka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10846v2-abstract-short" style="display: inline;"> Extracting geometry features from photographic images independently of surface texture and transferring them onto different materials remains a complex challenge. In this study, we introduce Harmonizing Attention, a novel training-free approach that leverages diffusion models for texture-aware geometry transfer. 
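The main-effect/residual-effect split in entry 8 can be illustrated with a doubly-robust-flavored estimate: importance sampling handles the main effect unbiasedly, while a regression model absorbs the residual with low variance. The code below is a hedged simplification; the paper's OPCB estimator is defined over a factored action space and is more involved.

```python
# Simplified estimator sketch after arXiv:2408.11202 (not the paper's OPCB).
import numpy as np

def opcb_style_estimate(rewards, pi_e_main, pi_b_main, q_hat):
    """rewards[i]: observed reward for logged round i.
    pi_e_main / pi_b_main: evaluation/behavior policy probabilities of the
    *main* part of the factored action given the context.
    q_hat[i]: regression estimate covering the residual-effect part."""
    w = pi_e_main / pi_b_main          # importance weights over main actions only
    ips_part = w * (rewards - q_hat)   # unbiased correction for the main effect
    dm_part = q_hat                    # low-variance regression (residual effect)
    return float(np.mean(ips_part + dm_part))

# Toy usage with synthetic logged data.
rng = np.random.default_rng(1)
n = 1000
r = rng.normal(1.0, 0.3, n)
q = rng.normal(0.9, 0.1, n)
pe = rng.uniform(0.1, 0.9, n)
pb = rng.uniform(0.1, 0.9, n)
v_hat = opcb_style_estimate(r, pe, pb, q)
```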
9. arXiv:2408.10846 [pdf, other] cs.CV cs.AI cs.MM
   Harmonizing Attention: Training-free Texture-aware Geometry Transfer
   Authors: Eito Ikuta, Yohan Lee, Akihiro Iohara, Yu Saito, Toshiyuki Tanaka
   Abstract: Extracting geometry features from photographic images independently of surface texture and transferring them onto different materials remains a complex challenge. In this study, we introduce Harmonizing Attention, a novel training-free approach that leverages diffusion models for texture-aware geometry transfer. Our method employs a simple yet effective modification of self-attention layers, allowing the model to query information from multiple reference images within these layers. This mechanism is seamlessly integrated into the inversion process as Texture-aligning Attention and into the generation process as Geometry-aligning Attention. This dual-attention approach ensures the effective capture and transfer of material-independent geometry features while maintaining material-specific textural continuity, all without the need for model fine-tuning.
   Submitted 1 September, 2024; v1 submitted 19 August, 2024; originally announced August 2024.
   Comments: Accepted at WACV2025
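Entry 9's key mechanism, letting self-attention query information from multiple reference images, can be sketched by extending the keys and values with reference features. The shapes and the plain scaled-dot-product form below are assumptions, not the paper's exact attention modification.

```python
# Illustrative reference-augmented self-attention after arXiv:2408.10846.
import torch
import torch.nn.functional as F

def reference_attention(q_feat, ref_feats, num_heads=8):
    """q_feat: (B, N, C) features of the image being generated/inverted.
    ref_feats: list of (B, M, C) feature maps from reference images
    (e.g., texture or geometry sources)."""
    kv = torch.cat([q_feat] + ref_feats, dim=1)  # keys/values see references too
    B, N, C = q_feat.shape
    d = C // num_heads
    q = q_feat.view(B, N, num_heads, d).transpose(1, 2)
    k = kv.view(B, -1, num_heads, d).transpose(1, 2)
    v = kv.view(B, -1, num_heads, d).transpose(1, 2)
    out = F.scaled_dot_product_attention(q, k, v)  # (B, heads, N, d)
    return out.transpose(1, 2).reshape(B, N, C)

feats = torch.randn(1, 64, 256)
refs = [torch.randn(1, 64, 256), torch.randn(1, 64, 256)]
out = reference_attention(feats, refs)
```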
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15828v1-abstract-full').style.display = 'none'; document.getElementById('2407.15828v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17722">arXiv:2406.17722</a> <span> [<a href="https://arxiv.org/pdf/2406.17722">pdf</a>, <a href="https://arxiv.org/format/2406.17722">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Spatial Voice Conversion: Voice Conversion Preserving Spatial Information and Non-target Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seki%2C+K">Kentaro Seki</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Takamune%2C+N">Norihiro Takamune</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Imamura%2C+K">Kanami Imamura</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17722v1-abstract-short" style="display: inline;"> This paper proposes a new task called spatial voice conversion, which aims to convert a target voice while preserving spatial information and non-target signals. Traditional voice conversion methods focus on single-channel waveforms, ignoring the stereo listening experience inherent in human hearing. Our baseline approach addresses this gap by integrating blind source separation (BSS), voice conve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17722v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17722v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17722v1-abstract-full" style="display: none;"> This paper proposes a new task called spatial voice conversion, which aims to convert a target voice while preserving spatial information and non-target signals. Traditional voice conversion methods focus on single-channel waveforms, ignoring the stereo listening experience inherent in human hearing. Our baseline approach addresses this gap by integrating blind source separation (BSS), voice conversion (VC), and spatial mixing to handle multi-channel waveforms. Through experimental evaluations, we organize and identify the key challenges inherent in this task, such as maintaining audio quality and accurately preserving spatial information. 
Our results highlight the fundamental difficulties in balancing these aspects, providing a benchmark for future research in spatial voice conversion. The proposed method's code is publicly available to encourage further exploration in this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17722v1-abstract-full').style.display = 'none'; document.getElementById('2406.17722v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07280">arXiv:2406.07280</a> <span> [<a href="https://arxiv.org/pdf/2406.07280">pdf</a>, <a href="https://arxiv.org/ps/2406.07280">ps</a>, <a href="https://arxiv.org/format/2406.07280">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Noise-Robust Voice Conversion by Conditional Denoising Training Using Latent Variables of Recording Quality and Environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Igarashi%2C+T">Takuto Igarashi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Seki%2C+K">Kentaro Seki</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Tachibana%2C+K">Kentaro Tachibana</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07280v1-abstract-short" style="display: inline;"> We propose noise-robust voice conversion (VC) which takes into account the recording quality and environment of noisy source speech. Conventional denoising training improves the noise robustness of a VC model by learning noisy-to-clean VC process. However, the naturalness of the converted speech is limited when the noise of the source speech is unseen during the training. To this end, our proposed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07280v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07280v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07280v1-abstract-full" style="display: none;"> We propose noise-robust voice conversion (VC) which takes into account the recording quality and environment of noisy source speech. Conventional denoising training improves the noise robustness of a VC model by learning noisy-to-clean VC process. 
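Entry 11's baseline is a three-stage pipeline: BSS, single-channel VC on the target source only, then spatial re-mixing. The sketch below illustrates that flow under a simplifying instantaneous-mixing assumption (real BSS front-ends typically estimate frequency-domain filters); the demixing matrix and identity "VC" are toy stand-ins, not the paper's code.

```python
# Pipeline-level sketch of spatial voice conversion after arXiv:2406.17722.
import numpy as np

def spatial_voice_conversion(x, demix, convert_voice, target_idx=0):
    """x: (channels, samples) multi-channel recording.
    demix: demixing matrix assumed to come from a BSS front-end.
    convert_voice: single-channel VC applied to the target source only."""
    # 1) Blind source separation into per-source signals.
    sources = demix @ x
    # 2) Convert only the target speaker; non-target signals stay untouched.
    sources[target_idx] = convert_voice(sources[target_idx])
    # 3) Re-mix with the inverse filter so the converted voice keeps its
    #    original spatial rendering.
    return np.linalg.inv(demix) @ sources

# Toy usage: identity "VC" on a random 2-channel signal.
x = np.random.randn(2, 16000)
demix = np.array([[1.0, 0.3], [0.2, 1.0]])
y = spatial_voice_conversion(x, demix, convert_voice=lambda s: s)
```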
However, the naturalness of the converted speech is limited when the noise of the source speech is unseen during the training. To this end, our proposed training conditions a VC model on two latent variables representing the recording quality and environment of the source speech. These latent variables are derived from deep neural networks pre-trained on recording quality assessment and acoustic scene classification and calculated in an utterance-wise or frame-wise manner. As a result, the trained VC model can explicitly learn information about speech degradation during the training. Objective and subjective evaluations show that our training improves the quality of the converted speech compared to the conventional training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07280v1-abstract-full').style.display = 'none'; document.getElementById('2406.07280v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted for INTERSPEECH 2024, audio samples: http://y-saito.sakura.ne.jp/sython/Corpus/SRC4VC/IS2024_CDT_supplementary/demo_cdt.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07254">arXiv:2406.07254</a> <span> [<a href="https://arxiv.org/pdf/2406.07254">pdf</a>, <a href="https://arxiv.org/ps/2406.07254">ps</a>, <a href="https://arxiv.org/format/2406.07254">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SRC4VC: Smartphone-Recorded Corpus for Voice Conversion Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Igarashi%2C+T">Takuto Igarashi</a>, <a href="/search/cs?searchtype=author&query=Seki%2C+K">Kentaro Seki</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Tachibana%2C+K">Kentaro Tachibana</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07254v1-abstract-short" style="display: inline;"> We present SRC4VC, a new corpus containing 11 hours of speech recorded on smartphones by 100 Japanese speakers. Although high-quality multi-speaker corpora can advance voice conversion (VC) technologies, they are not always suitable for testing VC when low-quality speech recording is given as the input. To this end, we first asked 100 crowdworkers to record their voice samples using smartphones. 
arXiv:2406.07254 [pdf, ps, other] cs.SD eess.AS
SRC4VC: Smartphone-Recorded Corpus for Voice Conversion Benchmark
Authors: Yuki Saito, Takuto Igarashi, Kentaro Seki, Shinnosuke Takamichi, Ryuichi Yamamoto, Kentaro Tachibana, Hiroshi Saruwatari
Abstract: We present SRC4VC, a new corpus containing 11 hours of speech recorded on smartphones by 100 Japanese speakers. Although high-quality multi-speaker corpora can advance voice conversion (VC) technologies, they are not always suitable for testing VC when low-quality speech recording is given as the input. To this end, we first asked 100 crowdworkers to record their voice samples using smartphones. Then, we annotated the recorded samples with speaker-wise recording-quality scores and utterance-wise perceived emotion labels. We also benchmark SRC4VC on any-to-any VC, in which we trained a multi-speaker VC model on high-quality speech and used the SRC4VC speakers' voice samples as the source in VC. The results show that the recording-quality mismatch between the training and evaluation data significantly degrades the VC performance, which can be improved by applying speech enhancement to the low-quality source speech samples.
Submitted 11 June, 2024; originally announced June 2024.
Comments: Accepted for INTERSPEECH 2024, corpus project page: https://y-saito.sakura.ne.jp/sython/Corpus/SRC4VC/index.html

arXiv:2405.14522 [pdf, other] cs.LG cs.AI cs.CL cs.CV stat.ML
Explaining Black-box Model Predictions via Two-level Nested Feature Attributions with Consistency Property
Authors: Yuya Yoshikawa, Masanari Kimura, Ryotaro Shimizu, Yuki Saito
Abstract: Techniques that explain the predictions of black-box machine learning models are crucial for making the models transparent, thereby increasing trust in AI systems. The input features to the models often have a nested structure consisting of high- and low-level features, where each high-level feature is decomposed into multiple low-level features. For such inputs, both high-level feature attributions (HiFAs) and low-level feature attributions (LoFAs) are important for better understanding the model's decision. In this paper, we propose a model-agnostic local explanation method that effectively exploits the nested structure of the input to estimate the two-level feature attributions simultaneously. A key idea of the proposed method is to introduce a consistency property that should hold between the HiFAs and LoFAs, thereby bridging the separate optimization problems for estimating them. Thanks to this consistency property, the proposed method can produce HiFAs and LoFAs that are both faithful to the black-box models and consistent with each other, using a smaller number of queries to the models. In experiments on image classification in multiple instance learning and text classification using language models, we demonstrate that the HiFAs and LoFAs estimated by the proposed method are accurate, faithful to the behaviors of the black-box models, and provide consistent explanations.
Submitted 23 May, 2024; originally announced May 2024.
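The consistency property above is naturally read as: each HiFA should equal the sum of the LoFAs of its constituent low-level features. The sketch below illustrates one illustrative way to enforce that constraint by projection; the group structure and the even split of the disagreement are our assumptions, not the paper's optimization.

```python
# Hedged sketch of the HiFA/LoFA consistency property: the attribution
# of a high-level feature should match the sum of its low-level
# attributions. The projection below is illustrative only.
import numpy as np

def project_to_consistency(hifa, lofa, groups):
    """Adjust HiFAs/LoFAs so hifa[j] == sum of lofa over groups[j]."""
    hifa, lofa = hifa.copy(), lofa.copy()
    for j, idx in enumerate(groups):
        gap = hifa[j] - lofa[idx].sum()
        # split the disagreement evenly between the two levels
        hifa[j] -= gap / 2.0
        lofa[idx] += (gap / 2.0) / len(idx)
    return hifa, lofa

# two high-level features; the first decomposes into low-level 0-2,
# the second into low-level 3-4 (hypothetical nesting)
groups = [np.array([0, 1, 2]), np.array([3, 4])]
hifa = np.array([1.0, -0.5])
lofa = np.array([0.2, 0.3, 0.1, -0.1, -0.2])
h, l = project_to_consistency(hifa, lofa, groups)
for j, idx in enumerate(groups):
    assert np.isclose(h[j], l[idx].sum())
```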
arXiv:2404.15691 [pdf, other] cs.LG stat.ML
Long-term Off-Policy Evaluation and Learning
Authors: Yuta Saito, Himan Abdollahpouri, Jesse Anderton, Ben Carterette, Mounia Lalmas
Abstract: Short- and long-term outcomes of an algorithm often differ, with damaging downstream effects. A known example is a click-bait algorithm, which may increase short-term clicks but damage long-term user engagement. A possible solution for estimating the long-term outcome is to run an online experiment or A/B test for the candidate algorithms, but it takes months or even longer to observe the long-term outcomes of interest, making the algorithm selection process unacceptably slow. This work thus studies the problem of feasibly yet accurately estimating the long-term outcome of an algorithm using only historical and short-term experiment data. Existing approaches to this problem either need a restrictive assumption about the short-term outcomes called surrogacy or cannot effectively use short-term outcomes, which is inefficient. Therefore, we propose a new framework called Long-term Off-Policy Evaluation (LOPE), which is based on reward function decomposition. LOPE works under a more relaxed assumption than surrogacy and effectively leverages short-term rewards to substantially reduce the variance. Synthetic experiments show that LOPE outperforms existing approaches, particularly when surrogacy is severely violated and the long-term reward is noisy. In addition, real-world experiments on large-scale A/B test data collected on a music streaming platform show that LOPE can estimate the long-term outcomes of actual algorithms more accurately than existing feasible methods.
Submitted 24 April, 2024; originally announced April 2024.
Comments: TheWebConference 2024
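To make "reward function decomposition" concrete, here is a LOPE-like toy construction under assumed notation: model the part of the long-term reward explained by short-term outcomes directly from the short-term experiment, and importance-weight only the residual. This is an illustrative doubly-robust-style estimator in the spirit of the abstract, not the paper's exact estimator.

```python
# Hedged sketch: reward decomposition for long-term OPE on synthetic data.
import numpy as np

rng = np.random.default_rng(0)
n = 100_000
# logged data under the behavior policy
s0 = rng.normal(0.0, 1.0, n)             # short-term outcome (surrogate)
r0 = 0.8 * s0 + rng.normal(0.0, 0.3, n)  # long-term reward
w = rng.uniform(0.5, 2.0, n)             # importance weights (illustrative)
w = w / w.mean()
# short-term outcomes observed for the evaluation policy in a short A/B test
s1 = rng.normal(0.2, 1.0, n)

def g(s):                                 # reward model fit on logged data;
    return 0.8 * s                        # assumed exact here for brevity

# decomposed estimate: direct part from the short-term experiment, plus an
# importance-weighted residual whose variance is much lower than r itself
v_decomposed = g(s1).mean() + np.mean(w * (r0 - g(s0)))
v_ips = np.mean(w * r0)                   # plain IPS baseline
print(f"decomposed: {v_decomposed:.3f}  plain IPS: {v_ips:.3f}")
```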
arXiv:2404.15084 [pdf, other] cs.LG
Hyperparameter Optimization Can Even be Harmful in Off-Policy Learning and How to Deal with It
Authors: Yuta Saito, Masahiro Nomura
Abstract: There has been growing interest in off-policy evaluation in areas such as recommender systems and personalized medicine. We have so far seen significant progress in developing estimators aimed at accurately estimating the effectiveness of counterfactual policies based on biased logged data. However, in many cases those estimators are used not only to evaluate the value of decision-making policies but also to search for the best hyperparameters from a large candidate space. This work explores the latter hyperparameter optimization (HPO) task for off-policy learning. We empirically show that naively applying an unbiased estimator of the generalization performance as a surrogate objective in HPO can cause an unexpected failure, merely pursuing hyperparameters whose generalization performance is greatly overestimated. We then propose simple and computationally efficient corrections to the typical HPO procedure to deal with the aforementioned issues simultaneously. Empirical investigations demonstrate the effectiveness of our proposed HPO algorithm in situations where the typical procedure fails severely.
Submitted 23 April, 2024; originally announced April 2024.
Comments: IJCAI'24
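The failure mode described above is a winner's curse: maximizing a noisy, unbiased estimate systematically selects configurations whose value is overestimated. The abstract does not spell out the paper's corrections, so the sketch below only demonstrates the failure and one generic conservative device (selecting by a lower confidence bound); the authors' actual procedure may differ.

```python
# Hedged illustration: naive argmax over noisy OPE estimates vs. a
# lower-confidence-bound (LCB) selection. Purely synthetic.
import numpy as np

rng = np.random.default_rng(1)
n_configs, n_logs = 50, 10
true_value = rng.uniform(0.4, 0.6, n_configs)
# per-config noise of the OPE estimate (some configs are much noisier)
noise = rng.uniform(0.01, 0.3, n_configs)
samples = true_value[:, None] + rng.normal(0, noise[:, None],
                                           (n_configs, n_logs))

est = samples.mean(axis=1)
se = samples.std(axis=1, ddof=1) / np.sqrt(n_logs)

naive = int(np.argmax(est))             # tends to favor noisy configs
conservative = int(np.argmax(est - 2 * se))
print("naive pick, true value:", true_value[naive])
print("LCB pick, true value:  ", true_value[conservative])
```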
arXiv:2403.17410 [pdf, other] cs.LG cs.AI stat.ML
On permutation-invariant neural networks
Authors: Masanari Kimura, Ryotaro Shimizu, Yuki Hirakawa, Ryosuke Goto, Yuki Saito
Abstract: Conventional machine learning algorithms have traditionally been designed under the assumption that input data follow a vector-based format, with an emphasis on vector-centric paradigms. However, as the demand for tasks involving set-based inputs has grown, there has been a paradigm shift in the research community towards addressing these challenges. In recent years, the emergence of neural network architectures such as Deep Sets and Transformers has presented a significant advancement in the treatment of set-based data. These architectures are specifically engineered to naturally accommodate sets as input, enabling more effective representation and processing of set structures. Consequently, there has been a surge of research endeavors dedicated to exploring and harnessing the capabilities of these architectures for various tasks involving the approximation of set functions. This comprehensive survey aims to provide an overview of the diverse problem settings and ongoing research efforts pertaining to neural networks that approximate set functions. By delving into the intricacies of these approaches and elucidating the associated challenges, the survey aims to equip readers with a comprehensive understanding of the field. Through this perspective, we hope that researchers can gain valuable insights into the potential applications, inherent limitations, and future directions of set-based neural networks. Indeed, from this survey we gain two insights: i) Deep Sets and its variants can be generalized by differences in the aggregation function, and ii) the behavior of Deep Sets is sensitive to the choice of the aggregation function. From these observations, we show that Deep Sets, one of the well-known permutation-invariant neural networks, can be generalized in the sense of a quasi-arithmetic mean.
Submitted 28 March, 2024; v1 submitted 26 March, 2024; originally announced March 2024.
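The closing observation has a compact concrete form: a Deep Sets model computes rho(agg({phi(x)})), and sum, arithmetic mean, geometric mean, and soft-max-like pooling are all quasi-arithmetic means f^{-1}(mean(f(.))) for a suitable monotone f. The toy sketch below (phi and rho are trivial stand-ins, not a real architecture) shows the family and checks permutation invariance.

```python
# Hedged sketch: Deep Sets with a quasi-arithmetic-mean aggregator.
import numpy as np

def phi(x):                      # per-element encoder (toy feature lift)
    return np.stack([x, x**2], axis=-1)

def rho(z):                      # set-level head (toy linear readout)
    return z @ np.array([1.0, 0.5])

def quasi_arithmetic_mean(z, f, f_inv):
    return f_inv(np.mean(f(z), axis=0))

def deep_set(xs, f, f_inv):
    return rho(quasi_arithmetic_mean(phi(xs), f, f_inv))

xs = np.array([0.5, 1.0, 2.0])   # positive so log/root aggregators apply
aggregators = [
    ("arithmetic", lambda z: z, lambda z: z),
    ("geometric", np.log, np.exp),
    ("power p=8 (soft-max-like)", lambda z: z**8, lambda z: z**(1 / 8)),
]
for name, f, f_inv in aggregators:
    out = deep_set(xs, f, f_inv)
    # permutation invariance: any reordering yields the same output
    assert np.allclose(out, deep_set(xs[::-1], f, f_inv))
    print(name, out)
```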
arXiv:2403.13720 [pdf, other] cs.SD eess.AS
UTDUSS: UTokyo-SaruLab System for Interspeech2024 Speech Processing Using Discrete Speech Unit Challenge
Authors: Wataru Nakata, Kazuki Yamauchi, Dong Yang, Hiroaki Hyodo, Yuki Saito
Abstract: We present UTDUSS, the UTokyo-SaruLab system submitted to the Interspeech2024 Speech Processing Using Discrete Speech Unit Challenge. The challenge focuses on using discrete speech units learned from large speech corpora for several tasks. We submitted our UTDUSS system to two text-to-speech tracks: Vocoder and Acoustic+Vocoder. Our system incorporates a neural audio codec (NAC) pre-trained only on speech corpora, which makes the learned codec represent the rich acoustic features necessary for high-fidelity speech reconstruction. For the Acoustic+Vocoder track, we trained a Transformer encoder-decoder acoustic model that predicts the pre-trained NAC tokens from text input. We describe our strategies for building these models, such as data selection, downsampling, and hyperparameter tuning. Our system ranked second and first in the Vocoder and Acoustic+Vocoder tracks, respectively.
Submitted 20 March, 2024; originally announced March 2024.
Comments: 5 pages, 3 figures

arXiv:2403.13353 [pdf, other] cs.SD eess.AS
Building speech corpus with diverse voice characteristics for its prompt-based representation
Authors: Aya Watanabe, Shinnosuke Takamichi, Yuki Saito, Wataru Nakata, Detai Xin, Hiroshi Saruwatari
Abstract: In text-to-speech synthesis, the ability to control voice characteristics is vital for various applications. By leveraging thriving text prompt-based generation techniques, it should be possible to enhance the nuanced control of voice characteristics. While previous research has explored the prompt-based manipulation of voice characteristics, most studies have used pre-recorded speech, which limits the diversity of voice characteristics available. Thus, we aim to address this gap by creating a novel corpus and developing a model for prompt-based manipulation of voice characteristics in text-to-speech synthesis, facilitating a broader range of voice characteristics. Specifically, we propose a method to build a sizable corpus pairing voice-characteristics descriptions with corresponding speech samples. This involves automatically gathering voice-related speech data from the Internet, ensuring its quality, and manually annotating it using crowdsourcing. We implement this method with Japanese-language data and analyze the results to validate its effectiveness. Subsequently, we propose a construction method for a model that retrieves speech from voice-characteristics descriptions based on contrastive learning. We train the model using not only contrastive learning but also feature-prediction learning to predict quantitative speech features corresponding to voice characteristics. We evaluate the model's performance via experiments with the corpus we constructed above.
Submitted 20 March, 2024; originally announced March 2024.
Comments: Submitted to IEEE/ACM Transactions on Audio, Speech, and Language Processing. arXiv admin note: text overlap with arXiv:2309.13509
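The retrieval model described above pairs a text encoder with a speech encoder under a contrastive objective. A minimal CLIP-style sketch follows, assuming toy embeddings in place of real encoders; the feature-prediction auxiliary term is shown only as an extra regression head (e.g., mean F0), which is our illustrative choice.

```python
# Hedged sketch: contrastive retrieval of speech from voice descriptions,
# plus an auxiliary feature-prediction loss.
import torch
import torch.nn.functional as F

def contrastive_loss(text_emb, speech_emb, temperature=0.07):
    t = F.normalize(text_emb, dim=-1)
    s = F.normalize(speech_emb, dim=-1)
    logits = t @ s.T / temperature          # (B, B) similarity matrix
    labels = torch.arange(len(t))           # i-th text matches i-th speech
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

B, d = 8, 256
text_emb = torch.randn(B, d, requires_grad=True)    # stand-in encoders
speech_emb = torch.randn(B, d, requires_grad=True)
loss = contrastive_loss(text_emb, speech_emb)

# auxiliary head: regress a quantitative speech feature from the text
# embedding -- shape only, hypothetical target
f0_target = torch.randn(B, 1)
f0_head = torch.nn.Linear(d, 1)
loss = loss + F.mse_loss(f0_head(text_emb), f0_target)
loss.backward()
print(float(loss))
```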
arXiv:2402.14369 [pdf, other] cs.IR, doi:10.1145/3589334.3645390
Scalable and Provably Fair Exposure Control for Large-Scale Recommender Systems
Authors: Riku Togashi, Kenshi Abe, Yuta Saito
Abstract: Typical recommendation and ranking methods aim to optimize the satisfaction of users, but they are often oblivious to their impact on the items (e.g., products, jobs, news, videos) and their providers. However, there has been a growing understanding that the latter is crucial to consider for a wide range of applications, since it determines the utility of those being recommended. Prior approaches to fairness-aware recommendation optimize a regularized objective to balance user satisfaction and item fairness based on some notion such as exposure fairness. These existing methods have been shown to be effective in controlling fairness; however, most of them are computationally inefficient, limiting their applications to unrealistically small-scale situations. This implies that the literature does not yet provide a solution for flexible exposure control in industry-scale recommender systems, where millions of users and items exist. To enable computationally efficient exposure control even for such large-scale systems, this work develops a scalable, fast, and fair method called exposure-aware ADMM (exADMM). exADMM is based on implicit alternating least squares (iALS), a conventional scalable algorithm for collaborative filtering, but optimizes a regularized objective to achieve flexible control of the accuracy-fairness tradeoff. A particular technical challenge in developing exADMM is that the fairness regularizer destroys the separability of the optimization subproblems for users and items, an essential property for the scalability of iALS. We therefore develop a set of optimization tools that enable scalable fairness control with provable convergence guarantees as the basis of our algorithm.
Submitted 22 February, 2024; originally announced February 2024.
Comments: accepted at WWW2024

arXiv:2402.06151 [pdf, other] stat.ML cs.LG
POTEC: Off-Policy Learning for Large Action Spaces via Two-Stage Policy Decomposition
Authors: Yuta Saito, Jihan Yao, Thorsten Joachims
Abstract: We study off-policy learning (OPL) of contextual bandit policies in large discrete action spaces, where existing methods -- most of which rely crucially on reward-regression models or importance-weighted policy gradients -- fail due to excessive bias or variance. To overcome these issues in OPL, we propose a novel two-stage algorithm called Policy Optimization via Two-Stage Policy Decomposition (POTEC). It leverages clustering in the action space and learns two different policies via policy- and regression-based approaches, respectively. In particular, we derive a novel low-variance gradient estimator that enables learning a first-stage policy for cluster selection efficiently via a policy-based approach. To select a specific action within the cluster sampled by the first-stage policy, POTEC uses a second-stage policy derived from a regression-based approach within each cluster. We show that a local correctness condition, which only requires that the regression model preserve the relative expected reward differences of the actions within each cluster, ensures that our policy-gradient estimator is unbiased and the second-stage policy is optimal. We also show that POTEC provides a strict generalization of policy- and regression-based approaches and their associated assumptions. Comprehensive experiments demonstrate that POTEC provides substantial improvements in OPL effectiveness, particularly in large and structured action spaces.
Submitted 8 February, 2024; originally announced February 2024.
Comments: arXiv admin note: text overlap with arXiv:2305.08062
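At decision time, the two-stage decomposition amounts to: sample a cluster from the first-stage policy, then act greedily within that cluster according to the regression model. The sketch below shows only this inference path with random toy parameters (training of both stages is omitted); the local-correctness remark in the comments paraphrases the abstract.

```python
# Hedged sketch of POTEC-style two-stage decision making.
import numpy as np

rng = np.random.default_rng(0)
n_actions, n_clusters, ctx_dim = 1000, 10, 5
cluster_of = rng.integers(0, n_clusters, n_actions)   # fixed clustering

W_cluster = rng.normal(size=(ctx_dim, n_clusters))    # 1st-stage scores
W_reward = rng.normal(size=(ctx_dim, n_actions))      # reward regressor

def potec_like_decision(x):
    # stage 1: softmax policy over clusters (in the paper, learned with a
    # low-variance policy gradient; training omitted here)
    scores = x @ W_cluster
    p = np.exp(scores - scores.max())
    p /= p.sum()
    c = rng.choice(n_clusters, p=p)
    # stage 2: greedy w.r.t. the regression model within the cluster;
    # local correctness only needs correct *relative* differences here
    actions = np.flatnonzero(cluster_of == c)
    return actions[np.argmax((x @ W_reward)[actions])]

x = rng.normal(size=ctx_dim)
print(potec_like_decision(x))
```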
arXiv:2402.02171 [pdf, other] stat.ML cs.LG, doi:10.1145/3589334.3645343
Off-Policy Evaluation of Slate Bandit Policies via Optimizing Abstraction
Authors: Haruka Kiyohara, Masahiro Nomura, Yuta Saito
Abstract: We study off-policy evaluation (OPE) in the problem of slate contextual bandits, where a policy selects multi-dimensional actions known as slates. This problem is widespread in applications from recommender systems and search engines to marketing and medicine; however, the typical Inverse Propensity Scoring (IPS) estimator suffers from substantial variance due to large action spaces, making effective OPE a significant challenge. The PseudoInverse (PI) estimator has been introduced to mitigate the variance issue by assuming linearity in the reward function, but this can result in significant bias, as the assumption is hard to verify from observed data and is often substantially violated. To address the limitations of previous estimators, we develop a novel estimator for OPE of slate bandits, called Latent IPS (LIPS), which defines importance weights in a low-dimensional slate abstraction space, where we optimize slate abstractions to minimize the bias and variance of LIPS in a data-driven way. By doing so, LIPS can substantially reduce the variance of IPS without imposing restrictive assumptions on the reward-function structure, such as linearity. Through empirical evaluation, we demonstrate that LIPS substantially outperforms existing estimators, particularly in scenarios with non-linear rewards and large slate spaces.
Submitted 17 February, 2024; v1 submitted 3 February, 2024; originally announced February 2024.
Comments: WWW2024
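The LIPS idea can be demonstrated numerically: instead of importance weights over raw slates (a huge space, hence huge variance), define weights over a low-dimensional abstraction z = phi(slate). In the sketch below, phi is a trivial stand-in and fixed; in the paper, the abstraction is optimized in a data-driven way.

```python
# Hedged sketch: importance weighting in a slate abstraction space.
import numpy as np

rng = np.random.default_rng(0)
n, n_items, slots, K = 50_000, 20, 3, 8

def phi(slates):                       # toy abstraction of a slate
    return slates.sum(axis=1) % K

def sample_slates(p_item, size):
    return rng.choice(n_items, size=(size, slots), p=p_item)

p0 = np.full(n_items, 1 / n_items)                 # logging policy
p1 = np.linspace(1, 2, n_items); p1 /= p1.sum()    # evaluation policy

logged = sample_slates(p0, n)
# reward depends on the slate only through z, so abstraction-level
# weighting is sufficient in this toy setup
rewards = phi(logged) / K + rng.normal(0, 0.1, n)

# estimate abstraction-space distributions by Monte Carlo and weight by
# their ratio, rather than by the slate-level propensity ratio
z = phi(logged)
q0 = np.bincount(z, minlength=K) / n
q1 = np.bincount(phi(sample_slates(p1, n)), minlength=K) / n
w_z = q1[z] / q0[z]
print("LIPS-like estimate:", np.mean(w_z * rewards))
```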
arXiv:2402.00288 [pdf, other] eess.AS cs.SD
Frame-Wise Breath Detection with Self-Training: An Exploration of Enhancing Breath Naturalness in Text-to-Speech
Authors: Dong Yang, Tomoki Koriyama, Yuki Saito
Abstract: Developing text-to-speech (TTS) systems that can synthesize natural breath is essential for human-like voice agents but requires extensive manual annotation of breath positions in training data. To this end, we propose a self-training method for training a breath detection model that can automatically detect breath positions in speech. Our method trains the model using a large speech corpus and involves: 1) annotation of a limited set of breath sounds using a rule-based approach, and 2) iterative augmentation of these annotations through pseudo-labeling based on the model's predictions. Our detection model employs Conformer blocks with down-/up-sampling layers, enabling accurate frame-wise breath detection. We investigate its effectiveness in multi-speaker TTS using text transcripts with detected breath marks. The results indicate that using our proposed model for breath detection and breath-mark insertion synthesizes breath-contained speech more naturally than a baseline model.
Submitted 14 June, 2024; v1 submitted 31 January, 2024; originally announced February 2024.
Comments: Accepted by INTERSPEECH2024
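The two-step self-training loop described above (rule-based seeds, then iterative pseudo-labeling) is easy to sketch. The model and features below are placeholders (logistic regression on random frame features, thresholds chosen arbitrarily), not the paper's Conformer detector.

```python
# Hedged sketch of self-training for frame-wise breath detection.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 16))                    # frame features (toy)
true_breath = X[:, 0] + 0.3 * X[:, 1] > 1.2        # hidden ground truth

# step 1: rule-based seed annotation covers only the obvious frames
seed_breath = X[:, 0] > 2.0
seed_nonbreath = X[:, 0] < -1.0
labeled = seed_breath | seed_nonbreath
y = seed_breath.copy()

for it in range(3):
    clf = LogisticRegression().fit(X[labeled], y[labeled])
    proba = clf.predict_proba(X)[:, 1]
    # step 2: accept confident predictions as new pseudo-labels
    confident = (proba > 0.9) | (proba < 0.1)
    y = np.where(confident, proba > 0.5, y)
    labeled = labeled | confident
    acc = np.mean((proba > 0.5) == true_breath)
    print(f"iter {it}: labeled={labeled.mean():.2f} frame acc={acc:.3f}")
```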
arXiv:2311.18207 [pdf, other] cs.LG cs.AI
Towards Assessing and Benchmarking Risk-Return Tradeoff of Off-Policy Evaluation
Authors: Haruka Kiyohara, Ren Kishimoto, Kosuke Kawakami, Ken Kobayashi, Kazuhide Nakata, Yuta Saito
Abstract: Off-Policy Evaluation (OPE) aims to assess the effectiveness of counterfactual policies using only offline logged data and is often used to identify the top-k promising policies for deployment in online A/B tests. Existing evaluation metrics for OPE estimators primarily focus on the "accuracy" of OPE or that of downstream policy selection, neglecting the risk-return tradeoff in the subsequent online policy deployment. To address this issue, we draw inspiration from portfolio evaluation in finance and develop a new metric, called SharpeRatio@k, which measures the risk-return tradeoff of policy portfolios formed by an OPE estimator under varying online evaluation budgets (k). We validate our metric in two example scenarios, demonstrating its ability to effectively distinguish between low-risk and high-risk estimators and to accurately identify the most efficient one. The efficiency of an estimator is characterized by its capability to form the most advantageous policy portfolios, maximizing returns while minimizing risks during online deployment, a nuance that existing metrics typically overlook. To facilitate a quick, accurate, and consistent evaluation of OPE via SharpeRatio@k, we have also integrated this metric into an open-source software package, SCOPE-RL (https://github.com/hakuhodo-technologies/scope-rl). Employing SharpeRatio@k and SCOPE-RL, we conduct comprehensive benchmarking experiments on various estimators and RL tasks, focusing on their risk-return tradeoff. These experiments offer several interesting directions and suggestions for future OPE research.
Submitted 10 March, 2024; v1 submitted 29 November, 2023; originally announced November 2023.
Comments: ICLR2024
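The abstract does not give SharpeRatio@k's formula, so the sketch below encodes one plausible reading of "risk-return tradeoff of policy portfolios": the OPE estimator's top-k policies form a portfolio, and the metric relates the portfolio's return over a safe baseline to its dispersion. Treat the exact definition here as our assumption, not a quote of the paper.

```python
# Hedged sketch of a SharpeRatio@k-style metric for OPE estimators.
import numpy as np

def sharpe_ratio_at_k(ope_estimates, true_values, v_behavior, k):
    portfolio = np.argsort(ope_estimates)[::-1][:k]   # top-k by OPE
    returns = true_values[portfolio]
    # return over the safe (behavior-policy) baseline, per unit of risk
    return (returns.max() - v_behavior) / (returns.std() + 1e-12)

rng = np.random.default_rng(0)
true_values = rng.uniform(0, 1, 30)                   # candidate policies
v_behavior = 0.5
accurate = true_values + rng.normal(0, 0.05, 30)      # low-risk estimator
reckless = true_values + rng.normal(0, 0.50, 30)      # high-risk estimator
for name, est in [("accurate", accurate), ("reckless", reckless)]:
    print(name, sharpe_ratio_at_k(est, true_values, v_behavior, k=5))
```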
arXiv:2311.18206 [pdf, other] cs.LG cs.AI
SCOPE-RL: A Python Library for Offline Reinforcement Learning and Off-Policy Evaluation
Authors: Haruka Kiyohara, Ren Kishimoto, Kosuke Kawakami, Ken Kobayashi, Kazuhide Nakata, Yuta Saito
Abstract: This paper introduces SCOPE-RL, a comprehensive open-source Python package designed for offline reinforcement learning (offline RL), off-policy evaluation (OPE), and selection (OPS). Unlike most existing libraries that focus solely on either policy learning or evaluation, SCOPE-RL seamlessly integrates these two key aspects, facilitating flexible and complete implementations of both offline RL and OPE processes. SCOPE-RL puts particular emphasis on its OPE modules, offering a range of OPE estimators and robust evaluation-of-OPE protocols. This approach enables more in-depth and reliable OPE compared to other packages. For instance, SCOPE-RL enhances OPE by estimating the entire reward distribution under a policy rather than its mere point-wise expected value. Additionally, SCOPE-RL provides a more thorough evaluation-of-OPE by presenting the risk-return tradeoff in OPE results, extending beyond the mere accuracy evaluations in existing OPE literature. SCOPE-RL is designed with user accessibility in mind. Its user-friendly APIs, comprehensive documentation, and a variety of easy-to-follow examples assist researchers and practitioners in efficiently implementing and experimenting with various offline RL methods and OPE estimators, tailored to their specific problem contexts. The documentation of SCOPE-RL is available at https://scope-rl.readthedocs.io/en/latest/.
Submitted 10 March, 2024; v1 submitted 29 November, 2023; originally announced November 2023.
Comments: preprint, open-source software: https://github.com/hakuhodo-technologies/scope-rl

arXiv:2311.16630 [pdf, other] cs.LG
Outfit Completion via Conditional Set Transformation
Authors: Takuma Nakamura, Yuki Saito, Ryosuke Goto
Abstract: In this paper, we formulate the outfit completion problem as a set retrieval task and propose a novel framework for solving it. The proposal includes a conditional set transformation architecture with deep neural networks and a compatibility-based regularization method. The proposed method uses a map that is permutation-invariant with respect to the input set and permutation-equivariant with respect to the condition set. This allows retrieving a set that is compatible with the input set while reflecting the properties of the condition set. In addition, since this structure outputs the elements of the output set in a single inference, it achieves a scalable inference speed with respect to the cardinality of the output set. Experimental results on real data reveal that the proposed method outperforms existing approaches in terms of accuracy on the outfit completion task, condition satisfaction, and compatibility of completion results.
Submitted 28 November, 2023; originally announced November 2023.
Comments: 8 pages, 8 figures
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.16509">arXiv:2311.16509</a> <span> [<a href="https://arxiv.org/pdf/2311.16509">pdf</a>, <a href="https://arxiv.org/ps/2311.16509">ps</a>, <a href="https://arxiv.org/format/2311.16509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> StyleCap: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-supervised Learning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yamauchi%2C+K">Kazuki Yamauchi</a>, <a href="/search/cs?searchtype=author&query=Ijima%2C+Y">Yusuke Ijima</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.16509v2-abstract-short" style="display: inline;"> We propose StyleCap, a method to generate natural language descriptions of speaking styles appearing in speech. Although most of conventional techniques for para-/non-linguistic information recognition focus on the category classification or the intensity estimation of pre-defined labels, they cannot provide the reasoning of the recognition result in an interpretable manner. StyleCap is a first st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16509v2-abstract-full').style.display = 'inline'; document.getElementById('2311.16509v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.16509v2-abstract-full" style="display: none;"> We propose StyleCap, a method to generate natural language descriptions of speaking styles appearing in speech. Although most of conventional techniques for para-/non-linguistic information recognition focus on the category classification or the intensity estimation of pre-defined labels, they cannot provide the reasoning of the recognition result in an interpretable manner. StyleCap is a first step towards an end-to-end method for generating speaking-style prompts from speech, i.e., automatic speaking-style captioning. StyleCap is trained with paired data of speech and natural language descriptions. We train neural networks that convert a speech representation vector into prefix vectors that are fed into a large language model (LLM)-based text decoder. We explore an appropriate text decoder and speech feature representation suitable for this new task. The experimental results demonstrate that our StyleCap leveraging richer LLMs for the text decoder, speech self-supervised learning (SSL) features, and sentence rephrasing augmentation improves the accuracy and diversity of generated speaking-style captions. 
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ICASSP2024</span> </p> </li>
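<p>The prefix-vector idea summarized in the StyleCap abstract above can be sketched compactly. In this hypothetical PyTorch module, a pooled speech representation is projected into a fixed number of prefix embeddings sized for an LLM decoder; all dimensions and the two-layer MLP are assumptions for illustration.</p> <pre><code class="language-python">
# Sketch: map one speech representation vector to a sequence of prefix
# vectors for an LLM text decoder. Sizes are illustrative assumptions.
import torch
import torch.nn as nn

class PrefixMapper(nn.Module):
    def __init__(self, speech_dim=768, llm_dim=1024, prefix_len=10):
        super().__init__()
        self.prefix_len, self.llm_dim = prefix_len, llm_dim
        self.net = nn.Sequential(
            nn.Linear(speech_dim, llm_dim * prefix_len // 2),
            nn.Tanh(),
            nn.Linear(llm_dim * prefix_len // 2, llm_dim * prefix_len),
        )

    def forward(self, speech_vec):                  # (batch, speech_dim)
        prefix = self.net(speech_vec)               # (batch, llm_dim * prefix_len)
        return prefix.view(-1, self.prefix_len, self.llm_dim)

speech_vec = torch.randn(4, 768)                    # e.g., pooled SSL features
prefix = PrefixMapper()(speech_vec)                 # shape (4, 10, 1024)
# `prefix` would be concatenated with caption token embeddings and fed to
# the LLM-based decoder during caption training.
</code></pre>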
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.14890">arXiv:2310.14890</a> <span> [<a href="https://arxiv.org/pdf/2310.14890">pdf</a>, <a href="https://arxiv.org/format/2310.14890">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Boosting for Bounding the Worst-class Error </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuya Saito</a>, <a href="/search/cs?searchtype=author&query=Matsuo%2C+S">Shinnosuke Matsuo</a>, <a href="/search/cs?searchtype=author&query=Uchida%2C+S">Seiichi Uchida</a>, <a href="/search/cs?searchtype=author&query=Suehiro%2C+D">Daiki Suehiro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: This paper tackles the problem of the worst-class error rate, instead of the standard error rate averaged over all classes. For example, a three-class classification task with class-wise error rates of 10%, 10%, and 40% has a worst-class error rate of 40%, whereas the average is 20% under the class-balanced condition. The worst-class error is important in many applications. For example, in a medical image classification task, it would not be acceptable for the malignant tumor class to have a 40% error rate while the benign and healthy classes have 10% error rates. We propose a boosting algorithm that guarantees an upper bound on the worst-class training error and derive its generalization bound. Experimental results show that the algorithm lowers worst-class test error rates while avoiding overfitting to the training set. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li>
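<p>The gap between average and worst-class error discussed above is easy to reproduce numerically. The NumPy sketch below rebuilds the abstract's 10%/10%/40% example and shows the kind of per-class upweighting a worst-class-focused booster might apply; the exponential update is a generic illustration, not the paper's exact algorithm.</p> <pre><code class="language-python">
# Toy contrast of average vs. worst-class error, plus a generic
# worst-class reweighting step.
import numpy as np

def class_errors(y_true, y_pred, n_classes):
    return np.array([np.mean(y_pred[y_true == c] != c) for c in range(n_classes)])

y_true = np.repeat([0, 1, 2], 100)
y_pred = y_true.copy()
rng = np.random.default_rng(0)
flip = rng.random(300) < np.where(y_true == 2, 0.4, 0.1)   # ~10%/10%/40% errors
y_pred[flip] = (y_true[flip] + 1) % 3

errs = class_errors(y_true, y_pred, 3)
print("average error:", errs.mean())        # roughly 0.20
print("worst-class error:", errs.max())     # roughly 0.40

# Upweight examples of the currently worst class so the next weak learner
# concentrates on it (exponential-update flavor, for illustration only).
weights = np.ones(300)
worst = errs.argmax()
weights[y_true == worst] *= np.exp(1.0)
weights /= weights.sum()
</code></pre>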
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.06072">arXiv:2310.06072</a> <span> [<a href="https://arxiv.org/pdf/2310.06072">pdf</a>, <a href="https://arxiv.org/format/2310.06072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/ACCESS.2024.3360885">10.1109/ACCESS.2024.3360885</a></span> </div> </div> </div> <p class="title is-5 mathjax"> JVNV: A Corpus of Japanese Emotional Speech with Verbal Content and Nonverbal Expressions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+D">Detai Xin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junfeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Aizawa%2C+A">Akiko Aizawa</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We present JVNV, a Japanese emotional speech corpus with verbal content and nonverbal vocalizations whose scripts are generated by a large-scale language model. Existing emotional speech corpora lack not only proper emotional scripts but also nonverbal vocalizations (NVs), which are essential expressions in spoken language for conveying emotions. We propose an automatic script generation method that produces emotional scripts by providing seed words with sentiment polarity and phrases of nonverbal vocalizations to ChatGPT using prompt engineering. We select 514 scripts with balanced phoneme coverage from the generated candidate scripts with the assistance of emotion confidence scores and language fluency scores. We demonstrate the effectiveness of JVNV by showing that it has better phoneme coverage and emotion recognizability than previous Japanese emotional speech corpora. We then benchmark JVNV on emotional text-to-speech synthesis using discrete codes to represent NVs. We show that there still exists a gap between the performance of synthesizing read-aloud speech and emotional speech, and that adding NVs makes the task even harder; this brings new challenges and makes JVNV a valuable resource for future work. To the best of our knowledge, JVNV is the first speech corpus whose scripts are generated automatically using large language models. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.13509">arXiv:2309.13509</a> <span> [<a href="https://arxiv.org/pdf/2309.13509">pdf</a>, <a href="https://arxiv.org/format/2309.13509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Coco-Nut: Corpus of Japanese Utterance and Voice Characteristics Description for Prompt-based Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Watanabe%2C+A">Aya Watanabe</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Nakata%2C+W">Wataru Nakata</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+D">Detai Xin</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: In text-to-speech, controlling voice characteristics is important for achieving speech synthesis for various purposes. Considering the success of text-conditioned generation such as text-to-image, free-form text instructions should be useful for intuitive and complex control of voice characteristics. A sufficiently large corpus of high-quality and diverse voice samples with corresponding free-form descriptions can advance such control research. However, neither an open corpus nor a scalable method is currently available. To this end, we develop Coco-Nut, a new corpus including diverse Japanese utterances, along with text transcriptions and free-form voice characteristics descriptions. Our methodology for constructing this corpus consists of 1) automatic collection of voice-related audio data from the Internet, 2) quality assurance, and 3) manual annotation using crowdsourcing. Additionally, we benchmark our corpus on a prompt embedding model trained with contrastive speech-text learning. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ASRU2023</span> </p> </li>
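<p>The contrastive speech-text training used to benchmark the corpus above follows the familiar CLIP-style recipe. The PyTorch sketch below computes a symmetric InfoNCE loss over a batch of paired speech and description embeddings; the encoders are replaced by random tensors and all dimensions are assumptions.</p> <pre><code class="language-python">
# CLIP-style contrastive loss between paired speech and text embeddings:
# matched pairs sit on the diagonal of the similarity matrix.
import torch
import torch.nn.functional as F

def contrastive_loss(speech_emb, text_emb, temperature=0.07):
    speech_emb = F.normalize(speech_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = speech_emb @ text_emb.t() / temperature   # (B, B) similarities
    targets = torch.arange(logits.size(0))             # i-th speech matches i-th text
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

speech_emb = torch.randn(8, 256, requires_grad=True)   # stand-in speech encoder output
text_emb = torch.randn(8, 256, requires_grad=True)     # stand-in description encoder output
loss = contrastive_loss(speech_emb, text_emb)
loss.backward()  # a real training loop would update both encoders here
</code></pre>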
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13509v1-abstract-full').style.display = 'none'; document.getElementById('2309.13509v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ASRU2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.08785">arXiv:2308.08785</a> <span> [<a href="https://arxiv.org/pdf/2308.08785">pdf</a>, <a href="https://arxiv.org/format/2308.08785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> A Feasibility-Preserved Quantum Approximate Solver for the Capacitated Vehicle Routing Problem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+N">Ningyi Xie</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+X">Xinwei Lee</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Dongsheng Cai</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yoshiyuki Saito</a>, <a href="/search/cs?searchtype=author&query=Asai%2C+N">Nobuyoshi Asai</a>, <a href="/search/cs?searchtype=author&query=Lau%2C+H+C">Hoong Chuin Lau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.08785v3-abstract-short" style="display: inline;"> The Capacitated Vehicle Routing Problem (CVRP) is an NP-optimization problem (NPO) that arises in various fields including transportation and logistics. The CVRP extends from the Vehicle Routing Problem (VRP), aiming to determine the most efficient plan for a fleet of vehicles to deliver goods to a set of customers, subject to the limited carrying capacity of each vehicle. As the number of possibl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08785v3-abstract-full').style.display = 'inline'; document.getElementById('2308.08785v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.08785v3-abstract-full" style="display: none;"> The Capacitated Vehicle Routing Problem (CVRP) is an NP-optimization problem (NPO) that arises in various fields including transportation and logistics. The CVRP extends from the Vehicle Routing Problem (VRP), aiming to determine the most efficient plan for a fleet of vehicles to deliver goods to a set of customers, subject to the limited carrying capacity of each vehicle. As the number of possible solutions skyrockets when the number of customers increases, finding the optimal solution remains a significant challenge. 
Recently, the Quantum Approximate Optimization Algorithm (QAOA), a quantum-classical hybrid algorithm, has exhibited enhanced performance on certain combinatorial optimization problems compared to classical heuristics. However, its ability diminishes notably in solving constrained optimization problems, including the CVRP. This limitation primarily arises from the typical approach of encoding the given problem as a penalty-inclusive binary optimization problem, in which case the QAOA faces challenges in sampling solutions that satisfy all constraints. Addressing this, our work presents a new binary encoding for the CVRP, with an alternative objective of minimizing the shortest path, which bypasses the vehicle capacity constraint of the CVRP. The search space is further restricted by a constraint-preserving mixing operation. We examine and discuss the effectiveness of the proposed encoding under the framework of the Quantum Alternating Operator Ansatz (AOA), a variant of the QAOA, through its application to several illustrative examples. Compared to the typical QAOA approach, the proposed method not only preserves feasibility but also achieves a significant enhancement in the probability of measuring optimal solutions. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 10 figures, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.15098">arXiv:2306.15098</a> <span> [<a href="https://arxiv.org/pdf/2306.15098">pdf</a>, <a href="https://arxiv.org/format/2306.15098">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3580305.3599447">10.1145/3580305.3599447 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Off-Policy Evaluation of Ranking Policies under Diverse User Behavior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kiyohara%2C+H">Haruka Kiyohara</a>, <a href="/search/cs?searchtype=author&query=Uehara%2C+M">Masatoshi Uehara</a>, <a href="/search/cs?searchtype=author&query=Narita%2C+Y">Yusuke Narita</a>, <a href="/search/cs?searchtype=author&query=Shimizu%2C+N">Nobuyuki Shimizu</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+Y">Yasuo Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuta Saito</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.15098v1-abstract-short" style="display: inline;"> Ranking interfaces are everywhere in online platforms. There is thus an ever growing interest in their Off-Policy Evaluation (OPE), aiming towards an accurate performance evaluation of ranking policies using logged data. A de-facto approach for OPE is Inverse Propensity Scoring (IPS), which provides an unbiased and consistent value estimate. However, it becomes extremely inaccurate in the ranking… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15098v1-abstract-full').style.display = 'inline'; document.getElementById('2306.15098v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.15098v1-abstract-full" style="display: none;"> Ranking interfaces are everywhere in online platforms. There is thus an ever growing interest in their Off-Policy Evaluation (OPE), aiming towards an accurate performance evaluation of ranking policies using logged data. A de-facto approach for OPE is Inverse Propensity Scoring (IPS), which provides an unbiased and consistent value estimate. However, it becomes extremely inaccurate in the ranking setup due to its high variance under large action spaces. To deal with this problem, previous studies assume either independent or cascade user behavior, resulting in some ranking versions of IPS. 
While these estimators are somewhat effective in reducing the variance, all existing estimators apply a single universal assumption to every user, causing excessive bias and variance. Therefore, this work explores a far more general formulation where user behavior is diverse and can vary depending on the user context. We show that the resulting estimator, which we call Adaptive IPS (AIPS), can be unbiased under any complex user behavior. Moreover, AIPS achieves the minimum variance among all unbiased estimators based on IPS. We further develop a procedure to identify the appropriate user behavior model that minimizes the mean squared error (MSE) of AIPS in a data-driven fashion. Extensive experiments demonstrate that the empirical accuracy improvement can be significant, enabling effective OPE of ranking systems even under diverse user behavior. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">KDD2023 Research track</span> </p> </li>
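<p>The behavior assumptions contrasted in the abstract above translate directly into different importance weights. The NumPy sketch below compares the standard full-ranking weight with the independent and cascade variants for a single logged ranking; the propensities are made up, and the adaptive, context-dependent model choice that defines AIPS itself is not implemented here.</p> <pre><code class="language-python">
# Ranking IPS variants: the reward at slot k is reweighted by a product of
# per-slot ratios whose range depends on the assumed user behavior.
import numpy as np

pi_e = np.array([0.5, 0.4, 0.2])   # evaluation policy's per-slot propensities
pi_b = np.array([0.25, 0.5, 0.4])  # logging policy's per-slot propensities
r = np.array([1.0, 0.0, 1.0])      # observed reward at each slot

ratio = pi_e / pi_b
w_standard = np.prod(ratio) * np.ones(3)     # full-ranking product: high variance
w_independent = ratio                        # slot-wise ratio only: low variance
w_cascade = np.cumprod(ratio)                # product over slots 1..k

for name, w in [("standard", w_standard), ("independent", w_independent),
                ("cascade", w_cascade)]:
    print(name, np.sum(w * r))
</code></pre>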
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12169">arXiv:2306.12169</a> <span> [<a href="https://arxiv.org/pdf/2306.12169">pdf</a>, <a href="https://arxiv.org/format/2306.12169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> HumanDiffusion: diffusion model using perceptual gradients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ueda%2C+Y">Yota Ueda</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Takamune%2C+N">Norihiro Takamune</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We propose HumanDiffusion, a diffusion model trained from humans' perceptual gradients to learn the range of data acceptable to humans (i.e., the human-acceptable distribution). The conventional HumanGAN aims to model a human-acceptable distribution wider than the real-data distribution by training a neural-network-based generator with human-based discriminators. However, HumanGAN training tends to converge to a meaningless distribution due to gradient vanishing or mode collapse and requires careful heuristics. In contrast, our HumanDiffusion learns the human-acceptable distribution through Langevin dynamics based on gradients of human perceptual evaluations. Our training iterates a process that diffuses real data to cover a wider human-acceptable distribution and avoids the issues of HumanGAN training. The evaluation results demonstrate that HumanDiffusion can successfully represent the human-acceptable distribution without any training heuristics. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of INTERSPEECH</span> </p> </li>
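<p>The Langevin-dynamics view described above can be illustrated with a self-contained stand-in for the human perceptual gradient. In the NumPy sketch below, the score of a Gaussian "acceptability" model plays that role; in the paper, the gradient would instead be estimated from human evaluations.</p> <pre><code class="language-python">
# Langevin sampling with a stand-in perceptual score function.
import numpy as np

rng = np.random.default_rng(0)
mu, sigma = np.array([1.0, -0.5]), 0.8

def score(x):
    # Stand-in for grad_x log p_human(x): the score of N(mu, sigma^2 I).
    return (mu - x) / sigma**2

x = rng.standard_normal(2) * 3.0      # start far from the acceptable region
eta = 0.05                            # step size
for _ in range(500):                  # Langevin update: drift + injected noise
    x = x + 0.5 * eta * score(x) + np.sqrt(eta) * rng.standard_normal(2)
print(x)  # wanders near mu, i.e., samples from the stand-in distribution
</code></pre>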
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of INTERSPEECH</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10656">arXiv:2306.10656</a> <span> [<a href="https://arxiv.org/pdf/2306.10656">pdf</a>, <a href="https://arxiv.org/format/2306.10656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Virtual Human Generative Model: Masked Modeling Approach for Learning Human Characteristics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oono%2C+K">Kenta Oono</a>, <a href="/search/cs?searchtype=author&query=Charoenphakdee%2C+N">Nontawat Charoenphakdee</a>, <a href="/search/cs?searchtype=author&query=Bito%2C+K">Kotatsu Bito</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhengyan Gao</a>, <a href="/search/cs?searchtype=author&query=Igata%2C+H">Hideyoshi Igata</a>, <a href="/search/cs?searchtype=author&query=Yoshikawa%2C+M">Masashi Yoshikawa</a>, <a href="/search/cs?searchtype=author&query=Ota%2C+Y">Yoshiaki Ota</a>, <a href="/search/cs?searchtype=author&query=Okui%2C+H">Hiroki Okui</a>, <a href="/search/cs?searchtype=author&query=Akita%2C+K">Kei Akita</a>, <a href="/search/cs?searchtype=author&query=Yamaguchi%2C+S">Shoichiro Yamaguchi</a>, <a href="/search/cs?searchtype=author&query=Sugawara%2C+Y">Yohei Sugawara</a>, <a href="/search/cs?searchtype=author&query=Maeda%2C+S">Shin-ichi Maeda</a>, <a href="/search/cs?searchtype=author&query=Miyoshi%2C+K">Kunihiko Miyoshi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Tsuda%2C+K">Koki Tsuda</a>, <a href="/search/cs?searchtype=author&query=Maruyama%2C+H">Hiroshi Maruyama</a>, <a href="/search/cs?searchtype=author&query=Hayashi%2C+K">Kohei Hayashi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10656v3-abstract-short" style="display: inline;"> Identifying the relationship between healthcare attributes, lifestyles, and personality is vital for understanding and improving physical and mental well-being. Machine learning approaches are promising for modeling their relationships and offering actionable suggestions. In this paper, we propose Virtual Human Generative Model (VHGM), a machine learning model for estimating healthcare, lifestyles… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10656v3-abstract-full').style.display = 'inline'; document.getElementById('2306.10656v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10656v3-abstract-full" style="display: none;"> Identifying the relationship between healthcare attributes, lifestyles, and personality is vital for understanding and improving physical and mental well-being. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.16807">arXiv:2305.16807</a> <span> [<a href="https://arxiv.org/pdf/2305.16807">pdf</a>, <a href="https://arxiv.org/format/2305.16807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Negative-prompt Inversion: Fast Image Inversion for Editing with Text-guided Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miyake%2C+D">Daiki Miyake</a>, <a href="/search/cs?searchtype=author&query=Iohara%2C+A">Akihiro Iohara</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yu Saito</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+T">Toshiyuki Tanaka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: In image editing employing diffusion models, it is crucial to preserve the reconstruction quality of the original image while changing its style.
Although existing methods ensure reconstruction quality through optimization, a drawback is the significant amount of time required for it. In this paper, we propose negative-prompt inversion, a method capable of achieving equivalent reconstruction solely through forward propagation without optimization, thereby enabling much faster editing. We experimentally demonstrate that the reconstruction quality of our method is comparable to that of existing methods, allowing for inversion at a resolution of 512 pixels with 50 sampling steps in approximately 5 seconds, more than 30 times faster than null-text inversion. The reduction in computation time further allows us to use more sampling steps in the diffusion model, improving reconstruction quality at a moderate computational cost. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 11 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13724">arXiv:2305.13724</a> <span> [<a href="https://arxiv.org/pdf/2305.13724">pdf</a>, <a href="https://arxiv.org/ps/2305.13724">ps</a>, <a href="https://arxiv.org/format/2305.13724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ChatGPT-EDSS: Empathetic Dialogue Speech Synthesis Trained from ChatGPT-derived Context Word Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Iimori%2C+E">Eiji Iimori</a>, <a href="/search/cs?searchtype=author&query=Tachibana%2C+K">Kentaro Tachibana</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We propose ChatGPT-EDSS, an empathetic dialogue speech synthesis (EDSS) method that uses ChatGPT to extract dialogue context. ChatGPT is a chatbot that can deeply understand the content and purpose of an input prompt and appropriately respond to the user's request. We focus on ChatGPT's reading comprehension and introduce it to EDSS, the task of synthesizing speech that can empathize with the interlocutor's emotion. Our method first gives the chat history to ChatGPT and asks it to generate three words representing the intention, emotion, and speaking style of each line in the chat. Then, it trains an EDSS model using the embeddings of the ChatGPT-derived context words as conditioning features. The experimental results demonstrate that our method performs comparably to methods using emotion labels or neural-network-derived context embeddings learned from chat histories. The collected ChatGPT-derived context information is available at https://sarulab-speech.github.io/demo_ChatGPT_EDSS/. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted for INTERSPEECH 2023</span> </p> </li>
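<p>The conditioning pipeline described above (chat history in, three ChatGPT-derived context words out, then word embeddings as conditioning features) can be sketched schematically. In the Python snippet below, ask_chatgpt and embed are hypothetical stand-ins stubbed with toy implementations so the script runs; a real system would call the chatbot and use pretrained word embeddings.</p> <pre><code class="language-python">
# Schematic sketch of context-word conditioning for EDSS.
import numpy as np

def ask_chatgpt(chat_history):
    # Hypothetical stand-in for an LLM call returning three words per line.
    prompt = ("For each line of the following dialogue, give three words "
              "describing the speaker's intention, emotion, and speaking "
              "style.\n\n" + "\n".join(chat_history))
    _ = prompt  # a real implementation would send this prompt to the chatbot
    return [("comfort", "sadness", "gentle")]

def embed(word, dim=16):
    # Toy deterministic word embedding (hash-seeded); a real system would
    # use pretrained embeddings instead.
    rng = np.random.default_rng(abs(hash(word)) % 2**32)
    return rng.standard_normal(dim)

history = ["A: I failed my exam again...", "B: That must be hard."]
(intention, emotion, style), = ask_chatgpt(history)
cond = np.concatenate([embed(w) for w in (intention, emotion, style)])
print(cond.shape)  # (48,): conditioning vector for the EDSS acoustic model
</code></pre>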
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted for INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13713">arXiv:2305.13713</a> <span> [<a href="https://arxiv.org/pdf/2305.13713">pdf</a>, <a href="https://arxiv.org/ps/2305.13713">ps</a>, <a href="https://arxiv.org/format/2305.13713">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CALLS: Japanese Empathetic Dialogue Speech Corpus of Complaint Handling and Attentive Listening in Customer Center </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Iimori%2C+E">Eiji Iimori</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Tachibana%2C+K">Kentaro Tachibana</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13713v1-abstract-short" style="display: inline;"> We present CALLS, a Japanese speech corpus that considers phone calls in a customer center as a new domain of empathetic spoken dialogue. The existing STUDIES corpus covers only empathetic dialogue between a teacher and student in a school. To extend the application range of empathetic dialogue speech synthesis (EDSS), we designed our corpus to include the same female speaker as the STUDIES teache… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13713v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13713v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13713v1-abstract-full" style="display: none;"> We present CALLS, a Japanese speech corpus that considers phone calls in a customer center as a new domain of empathetic spoken dialogue. The existing STUDIES corpus covers only empathetic dialogue between a teacher and student in a school. To extend the application range of empathetic dialogue speech synthesis (EDSS), we designed our corpus to include the same female speaker as the STUDIES teacher, acting as an operator in simulated phone calls. We describe a corpus construction methodology and analyze the recorded speech. We also conduct EDSS experiments using the CALLS and STUDIES corpora to investigate the effect of domain differences. The results show that mixing the two corpora during training causes biased improvements in the quality of synthetic speech due to the different degrees of expressiveness. Our project page of the corpus is http://sython.org/Corpus/STUDIES-2. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13713v1-abstract-full').style.display = 'none'; document.getElementById('2305.13713v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted for INTERSPEECH2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.08062">arXiv:2305.08062</a> <span> [<a href="https://arxiv.org/pdf/2305.08062">pdf</a>, <a href="https://arxiv.org/format/2305.08062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Off-Policy Evaluation for Large Action Spaces via Conjunct Effect Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuta Saito</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Q">Qingyang Ren</a>, <a href="/search/cs?searchtype=author&query=Joachims%2C+T">Thorsten Joachims</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.08062v2-abstract-short" style="display: inline;"> We study off-policy evaluation (OPE) of contextual bandit policies for large discrete action spaces where conventional importance-weighting approaches suffer from excessive variance. To circumvent this variance issue, we propose a new estimator, called OffCEM, that is based on the conjunct effect model (CEM), a novel decomposition of the causal effect into a cluster effect and a residual effect. O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08062v2-abstract-full').style.display = 'inline'; document.getElementById('2305.08062v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.08062v2-abstract-full" style="display: none;"> We study off-policy evaluation (OPE) of contextual bandit policies for large discrete action spaces where conventional importance-weighting approaches suffer from excessive variance. To circumvent this variance issue, we propose a new estimator, called OffCEM, that is based on the conjunct effect model (CEM), a novel decomposition of the causal effect into a cluster effect and a residual effect. OffCEM applies importance weighting only to action clusters and addresses the residual causal effect through model-based reward estimation. We show that the proposed estimator is unbiased under a new condition, called local correctness, which only requires that the residual-effect model preserves the relative expected reward differences of the actions within each cluster. 
To best leverage the CEM and local correctness, we also propose a new two-step procedure for model-based estimation that minimizes bias in the first step and variance in the second. We find that the resulting OffCEM estimator substantially improves on a range of conventional estimators in both bias and variance. Experiments demonstrate that OffCEM provides substantial improvements in OPE, especially in the presence of many actions. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at ICML2023. arXiv admin note: text overlap with arXiv:2202.06317</span> </p> </li>
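<p>The two-part structure of the OffCEM estimator described above (cluster-level importance weighting of the residual plus a model-based term) can be written in a few lines. The NumPy sketch below uses synthetic propensities, a toy reward model, and a uniform within-cluster action distribution; all of these are assumptions for illustration rather than the paper's setup.</p> <pre><code class="language-python">
# Sketch of an OffCEM-style estimate on synthetic logged bandit data.
import numpy as np

rng = np.random.default_rng(0)
n, n_actions, n_clusters = 1000, 50, 5
cluster_of = rng.integers(n_clusters, size=n_actions)   # action -> cluster map

a = rng.integers(n_actions, size=n)                     # logged actions
r = rng.normal(cluster_of[a] * 0.1, 1.0)                # logged rewards
f_hat = lambda act: cluster_of[act] * 0.1               # toy reward model

# Cluster-level propensities of the evaluation / logging policies (made up).
pi_e_c = rng.dirichlet(np.ones(n_clusters), size=n)
pi_b_c = np.full((n, n_clusters), 1.0 / n_clusters)
w = pi_e_c[np.arange(n), cluster_of[a]] / pi_b_c[np.arange(n), cluster_of[a]]

# Model-based term: expected f_hat under the evaluation policy, taking the
# within-cluster action distribution as uniform for simplicity.
cluster_mean_f = np.array([f_hat(np.where(cluster_of == c)[0]).mean()
                           for c in range(n_clusters)])
dm_term = (pi_e_c * cluster_mean_f).sum(axis=1)

v_offcem = np.mean(w * (r - f_hat(a)) + dm_term)
print(v_offcem)
</code></pre>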
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.13652">arXiv:2302.13652</a> <span> [<a href="https://arxiv.org/pdf/2302.13652">pdf</a>, <a href="https://arxiv.org/ps/2302.13652">ps</a>, <a href="https://arxiv.org/format/2302.13652">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Duration-aware pause insertion using pre-trained language model for multi-speaker text-to-speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Koriyama%2C+T">Tomoki Koriyama</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Saeki%2C+T">Takaaki Saeki</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+D">Detai Xin</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Pause insertion, also known as phrase break prediction and phrasing, is an essential part of TTS systems because proper pauses with natural duration significantly enhance the rhythm and intelligibility of synthetic speech. However, conventional phrasing models ignore the different styles in which speakers insert silent pauses, which can degrade the performance of a model trained on a multi-speaker speech corpus. To address this, we propose more powerful pause insertion frameworks based on a pre-trained language model. Our approach uses bidirectional encoder representations from transformers (BERT) pre-trained on a large-scale text corpus, injecting speaker embeddings to capture various speaker characteristics. We also leverage duration-aware pause insertion for more natural multi-speaker TTS. We develop and evaluate two types of models. The first improves conventional phrasing models on the position prediction of respiratory pauses (RPs), i.e., silent pauses at word transitions without punctuation. It performs speaker-conditioned RP prediction considering contextual information and is used to demonstrate the effect of speaker information on the prediction. The second model is further designed for phoneme-based TTS models and performs duration-aware pause insertion, predicting both RPs and punctuation-indicated pauses (PIPs), which are categorized by duration. The evaluation results show that our models improve the precision and recall of pause insertion and the rhythm of synthetic speech. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2023</span> </p> </li>
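<p>The speaker-conditioned pause prediction described above amounts to a per-position classifier over contextual features plus a speaker embedding. In the PyTorch sketch below, random tensors stand in for the BERT features, and the sizes and binary pause head are illustrative assumptions.</p> <pre><code class="language-python">
# Speaker-conditioned respiratory-pause (RP) prediction, schematically.
import torch
import torch.nn as nn

hidden, spk_dim, n_speakers, seq_len = 768, 64, 10, 12
spk_table = nn.Embedding(n_speakers, spk_dim)
classifier = nn.Linear(hidden + spk_dim, 2)     # pause / no pause per position

tokens = torch.randn(1, seq_len, hidden)        # stand-in for BERT features
spk = spk_table(torch.tensor([3]))              # embedding of speaker #3
spk_expanded = spk.unsqueeze(1).expand(-1, seq_len, -1)
logits = classifier(torch.cat([tokens, spk_expanded], dim=-1))
print(logits.shape)  # (1, 12, 2): RP decision at each word boundary
# A duration-aware variant would replace the binary head with classes for
# punctuation-indicated pauses (PIPs) bucketed by duration.
</code></pre>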
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13904">arXiv:2211.13904</a> <span> [<a href="https://arxiv.org/pdf/2211.13904">pdf</a>, <a href="https://arxiv.org/format/2211.13904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Policy-Adaptive Estimator Selection for Off-Policy Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Udagawa%2C+T">Takuma Udagawa</a>, <a href="/search/cs?searchtype=author&query=Kiyohara%2C+H">Haruka Kiyohara</a>, <a href="/search/cs?searchtype=author&query=Narita%2C+Y">Yusuke Narita</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuta Saito</a>, <a href="/search/cs?searchtype=author&query=Tateno%2C+K">Kei Tateno</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13904v2-abstract-short" style="display: inline;"> Off-policy evaluation (OPE) aims to accurately evaluate the performance of counterfactual policies using only offline logged data. Although many estimators have been developed, there is no single estimator that dominates the others, because the estimators' accuracy can vary greatly depending on a given OPE task such as the evaluation policy, number of actions, and noise level. Thus, the data-drive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13904v2-abstract-full').style.display = 'inline'; document.getElementById('2211.13904v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13904v2-abstract-full" style="display: none;"> Off-policy evaluation (OPE) aims to accurately evaluate the performance of counterfactual policies using only offline logged data. Although many estimators have been developed, there is no single estimator that dominates the others, because the estimators' accuracy can vary greatly depending on a given OPE task such as the evaluation policy, number of actions, and noise level. Thus, the data-driven estimator selection problem is becoming increasingly important and can have a significant impact on the accuracy of OPE. However, identifying the most accurate estimator using only the logged data is quite challenging because the ground-truth estimation accuracy of estimators is generally unavailable. This paper studies this challenging problem of estimator selection for OPE for the first time. In particular, we enable an estimator selection that is adaptive to a given OPE task, by appropriately subsampling available logged data and constructing pseudo policies useful for the underlying estimator selection task. Comprehensive experiments on both synthetic and real-world company data demonstrate that the proposed procedure substantially improves the estimator selection compared to a non-adaptive heuristic. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13904v2-abstract-full').style.display = 'none'; document.getElementById('2211.13904v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at AAAI'23</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.09916">arXiv:2210.09916</a> <span> [<a href="https://arxiv.org/pdf/2210.09916">pdf</a>, <a href="https://arxiv.org/format/2210.09916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mid-attribute speaker generation using optimal-transport-based interpolation of Gaussian mixture models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Watanabe%2C+A">Aya Watanabe</a>, <a href="/search/cs?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+Y">Yuki Saito</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+D">Detai Xin</a>, <a href="/search/cs?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.09916v1-abstract-short" style="display: inline;"> In this paper, we propose a method for intermediating multiple speakers' attributes and diversifying their voice characteristics in ``speaker generation,'' an emerging task that aims to synthesize a nonexistent speaker's naturally sounding voice. The conventional TacoSpawn-based speaker generation method represents the distributions of speaker embeddings by Gaussian mixture models (GMMs) condition… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.09916v1-abstract-full').style.display = 'inline'; document.getElementById('2210.09916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.09916v1-abstract-full" style="display: none;"> In this paper, we propose a method for intermediating multiple speakers' attributes and diversifying their voice characteristics in ``speaker generation,'' an emerging task that aims to synthesize a nonexistent speaker's naturally sounding voice. The conventional TacoSpawn-based speaker generation method represents the distributions of speaker embeddings by Gaussian mixture models (GMMs) conditioned with speaker attributes. Although this method enables the sampling of various speakers from the speaker-attribute-aware GMMs, it is not yet clear whether the learned distributions can represent speakers with an intermediate attribute (i.e., mid-attribute). 
arXiv:2209.12549 (https://arxiv.org/abs/2209.12549) [pdf, ps, other]
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS)
Title: Multi-Task Adversarial Training Algorithm for Multi-Speaker Neural Text-to-Speech
Authors: Yusuke Nakai, Yuki Saito, Kenta Udagawa, Hiroshi Saruwatari
Abstract: We propose a novel training algorithm for a multi-speaker neural text-to-speech (TTS) model based on multi-task adversarial training. A conventional generative adversarial network (GAN)-based training algorithm significantly improves the quality of synthetic speech by reducing the statistical difference between natural and synthetic speech. However, the algorithm does not guarantee the generalization performance of the trained TTS model in synthesizing voices of unseen speakers who are not included in the training data. Our algorithm alternately trains two deep neural networks: a multi-task discriminator and a multi-speaker neural TTS model (i.e., the generator of GANs). The discriminator is trained not only to distinguish between natural and synthetic speech but also to verify whether the speaker of the input speech is existent or non-existent (i.e., newly generated by interpolating seen speakers' embedding vectors). Meanwhile, the generator is trained to minimize the weighted sum of the speech reconstruction loss and the adversarial loss for fooling the discriminator, which achieves high-quality multi-speaker TTS even if the target speaker is unseen. Experimental evaluation shows that our algorithm improves the quality of synthetic speech more than the conventional GANSpeech algorithm.
Submitted: 26 September, 2022; originally announced September 2022.
Comments: 6 pages, 1 figure, Accepted for APSIPA ASC 2022
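The two-headed training objective can be sketched as follows. This is a hedged PyTorch outline under assumed interfaces (`disc` returns a natural-vs-synthetic logit and an existent-vs-non-existent speaker logit; `syn_feats` come from the TTS generator); it illustrates the weighted-sum objective described in the abstract, not the paper's exact losses.

```python
import torch
import torch.nn.functional as F

def discriminator_loss(disc, nat_feats, syn_feats, spk_emb, is_existent):
    """Two-headed discriminator: head 1 scores natural vs. synthetic speech,
    head 2 scores whether the conditioning speaker is existent (1.0) or a
    non-existent interpolation of seen speakers' embeddings (0.0)."""
    real_logit, real_spk_logit = disc(nat_feats, spk_emb)
    fake_logit, _ = disc(syn_feats.detach(), spk_emb)
    adv = (F.binary_cross_entropy_with_logits(real_logit, torch.ones_like(real_logit))
           + F.binary_cross_entropy_with_logits(fake_logit, torch.zeros_like(fake_logit)))
    spk = F.binary_cross_entropy_with_logits(real_spk_logit, is_existent)
    return adv + spk

def generator_loss(disc, syn_feats, nat_feats, spk_emb, w_adv=1.0):
    """Generator (the TTS model) minimizes reconstruction loss plus the
    adversarial loss for fooling the natural-vs-synthetic head."""
    recon = F.l1_loss(syn_feats, nat_feats)
    fake_logit, _ = disc(syn_feats, spk_emb)
    adv = F.binary_cross_entropy_with_logits(fake_logit, torch.ones_like(fake_logit))
    return recon + w_adv * adv
```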
arXiv:2206.10256 (https://arxiv.org/abs/2206.10256) [pdf, other]
Subjects: Sound (cs.SD); Human-Computer Interaction (cs.HC); Neural and Evolutionary Computing (cs.NE); Audio and Speech Processing (eess.AS)
Title: Human-in-the-loop Speaker Adaptation for DNN-based Multi-speaker TTS
Authors: Kenta Udagawa, Yuki Saito, Hiroshi Saruwatari
Abstract: This paper proposes a human-in-the-loop speaker-adaptation method for multi-speaker text-to-speech. With a conventional speaker-adaptation method, a target speaker's embedding vector is extracted from his/her reference speech using a speaker encoder trained on a speaker-discriminative task. However, this method cannot obtain an embedding vector for the target speaker when the reference speech is unavailable. Our method is based on a human-in-the-loop optimization framework, which incorporates a user to explore the speaker-embedding space to find the target speaker's embedding. The proposed method uses a sequential line search algorithm that repeatedly asks a user to select a point on a line segment in the embedding space. To efficiently choose the best speech sample from multiple stimuli, we also developed a system in which a user can switch between multiple speakers' voices for each phoneme while looping an utterance. Experimental results indicate that the proposed method can achieve performance comparable to the conventional one in objective and subjective evaluations, even when reference speech is not used directly as the input of a speaker encoder.
Submitted: 21 June, 2022; originally announced June 2022.
Comments: 5 pages, 3 figures, Accepted for INTERSPEECH2022
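A bare-bones version of the sequential line search loop might look like this, with `ask_user` standing in for the actual listening test; the random-direction scheme and all parameters are assumptions for illustration, not the paper's algorithm.

```python
import numpy as np

def sequential_line_search(ask_user, dim=256, n_rounds=10, seed=0):
    """Explore a speaker-embedding space with a human in the loop: each
    round, the user auditions synthesized voices along a line segment and
    returns the preferred position t in [-1, 1]; the search recenters there."""
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(dim)          # initial embedding guess
    for _ in range(n_rounds):
        d = rng.standard_normal(dim)
        d /= np.linalg.norm(d)            # random search direction
        t = ask_user(x, d)                # user listens to TTS at x + t * d
        x = x + t * d
    return x
```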
arXiv:2206.08039 (https://arxiv.org/abs/2206.08039) [pdf, ps, other]
Subjects: Sound (cs.SD); Computation and Language (cs.CL); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS)
Title: Acoustic Modeling for End-to-End Empathetic Dialogue Speech Synthesis Using Linguistic and Prosodic Contexts of Dialogue History
Authors: Yuto Nishimura, Yuki Saito, Shinnosuke Takamichi, Kentaro Tachibana, Hiroshi Saruwatari
Abstract: We propose an end-to-end empathetic dialogue speech synthesis (DSS) model that considers both the linguistic and prosodic contexts of dialogue history. Empathy is the active attempt by humans to get inside the interlocutor in dialogue, and empathetic DSS is a technology to implement this act in spoken dialogue systems. Our model is conditioned on the history of linguistic and prosody features to predict an appropriate dialogue context. As such, it can be regarded as an extension of conventional linguistic-feature-based dialogue history modeling. To train the empathetic DSS model effectively, we investigate 1) a self-supervised learning model pretrained with large speech corpora, 2) a style-guided training using a prosody embedding of the current utterance to be predicted by the dialogue context embedding, 3) a cross-modal attention to combine text and speech modalities, and 4) a sentence-wise embedding to achieve fine-grained prosody modeling rather than utterance-wise modeling. The evaluation results demonstrate that 1) simply considering prosodic contexts of the dialogue history does not improve the quality of speech in empathetic DSS, and 2) introducing style-guided training and sentence-wise embedding modeling achieves higher speech quality than the conventional method.
Submitted: 16 June, 2022; originally announced June 2022.
Comments: 5 pages, 3 figures, Accepted for INTERSPEECH2022
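As a rough sketch of conditioning on dialogue history, one could encode the concatenated linguistic and prosody features of past turns with a recurrent network and feed the final state to the synthesizer. The module below is a hypothetical stand-in with assumed dimensions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class DialogueContextEncoder(nn.Module):
    """Encode linguistic and prosody features of past utterances into a
    single context embedding that conditions the DSS acoustic model."""
    def __init__(self, d_ling=256, d_pros=64, d_ctx=128):
        super().__init__()
        self.gru = nn.GRU(d_ling + d_pros, d_ctx, batch_first=True)

    def forward(self, ling_hist, pros_hist):
        # ling_hist: (B, T, d_ling); pros_hist: (B, T, d_pros) over T past turns
        _, h = self.gru(torch.cat([ling_hist, pros_hist], dim=-1))
        return h[-1]  # (B, d_ctx) dialogue-context embedding
```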
arXiv:2206.07247 (https://arxiv.org/abs/2206.07247) [pdf, other]
Subjects: Information Retrieval (cs.IR); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
DOI: https://doi.org/10.1145/3534678.3539353
Title: Fair Ranking as Fair Division: Impact-Based Individual Fairness in Ranking
Authors: Yuta Saito, Thorsten Joachims
Abstract: Rankings have become the primary interface in two-sided online markets. Many have noted that the rankings not only affect the satisfaction of the users (e.g., customers, listeners, employers, travelers), but that the position in the ranking allocates exposure -- and thus economic opportunity -- to the ranked items (e.g., articles, products, songs, job seekers, restaurants, hotels). This has raised questions of fairness to the items, and most existing works have addressed fairness by explicitly linking item exposure to item relevance. However, we argue that any particular choice of such a link function may be difficult to defend, and we show that the resulting rankings can still be unfair. To avoid these shortcomings, we develop a new axiomatic approach that is rooted in principles of fair division. This not only avoids the need to choose a link function, but also more meaningfully quantifies the impact on the items beyond exposure. Our axioms of envy-freeness and dominance over uniform ranking postulate that for a fair ranking policy every item should prefer its own rank allocation over that of any other item, and that no item should be actively disadvantaged by the rankings. To compute ranking policies that are fair according to these axioms, we propose a new ranking objective related to the Nash Social Welfare. We show that the solution has guarantees regarding its envy-freeness, its dominance over uniform rankings for every item, and its Pareto optimality. In contrast, we show that conventional exposure-based fairness can produce large amounts of envy and have a highly disparate impact on the items. Beyond these theoretical results, we illustrate empirically how our framework controls the trade-off between impact-based individual item fairness and user utility.
Submitted: 30 August, 2022; v1 submitted 14 June, 2022; originally announced June 2022.
Comments: accepted at KDD2022, a few minor updates from the camera ready version
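The Nash-Social-Welfare-flavored objective admits a one-line sketch: maximize the sum of log "impacts" over items, where an item's impact is its exposure-weighted relevance under a stochastic ranking policy. The position-based exposure model and all names below are illustrative assumptions, not the paper's exact formulation.

```python
import numpy as np

def nsw_objective(P, expo, rel):
    """Nash-Social-Welfare-style ranking objective. P[i, k] is the marginal
    probability that item i is placed at rank k under a stochastic ranking
    policy, expo[k] is the exposure of rank k, and rel[i] is item i's
    relevance; impacts are assumed strictly positive."""
    impact = rel * (P @ expo)       # per-item exposure-weighted relevance
    return np.sum(np.log(impact))   # maximize over doubly stochastic P
```

Because the log saturates, maximizing this objective favors allocations where no item's impact is driven toward zero, which is the intuition behind the envy-freeness and dominance guarantees the abstract cites.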
arXiv:2203.14757 (https://arxiv.org/abs/2203.14757) [pdf, ps, other]
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG)
Title: STUDIES: Corpus of Japanese Empathetic Dialogue Speech Towards Friendly Voice Agent
Authors: Yuki Saito, Yuto Nishimura, Shinnosuke Takamichi, Kentaro Tachibana, Hiroshi Saruwatari
Abstract: We present STUDIES, a new speech corpus for developing a voice agent that can speak in a friendly manner. Humans naturally control their speech prosody to empathize with each other. By incorporating this "empathetic dialogue" behavior into a spoken dialogue system, we can develop a voice agent that can respond to a user more naturally. We designed the STUDIES corpus to include a speaker who explicitly speaks with empathy for the interlocutor's emotion. We describe our methodology for constructing an empathetic dialogue speech corpus and report the analysis results of the STUDIES corpus. We conducted a text-to-speech experiment to initially investigate how we can develop a more natural voice agent that can tune its speaking style to the interlocutor's emotion. The results show that using the interlocutor's emotion label and conversational context embedding can produce speech with the same degree of naturalness as that synthesized by using the agent's emotion label. The project page of the STUDIES corpus is http://sython.org/Corpus/STUDIES.
Submitted: 16 June, 2022; v1 submitted 28 March, 2022; originally announced March 2022.
Comments: 5 pages, 2 figures, Accepted for INTERSPEECH2022, project page: http://sython.org/Corpus/STUDIES
arXiv:2202.13868 (https://arxiv.org/abs/2202.13868) [pdf, other]
Subjects: Information Retrieval (cs.IR); Machine Learning (cs.LG)
DOI: https://doi.org/10.1109/BigData52589.2021.9671800
Title: A Real-World Implementation of Unbiased Lift-based Bidding System
Authors: Daisuke Moriwaki, Yuta Hayakawa, Akira Matsui, Yuta Saito, Isshu Munemasa, Masashi Shibata
Abstract: In display ad auctions of Real-Time Bidding (RTB), a typical Demand-Side Platform (DSP) bids based on the predicted probability of click and conversion right after an ad impression. Recent studies find such a strategy is suboptimal and propose a better bidding strategy named lift-based bidding. Lift-based bidding simply bids the price according to the lift effect of the ad impression and achieves maximization of target metrics such as sales. Despite its superiority, lift-based bidding has not yet been widely accepted in the advertising industry. For one reason, lift-based bidding is less profitable for DSP providers under the current billing rule. Second, the practical usefulness of lift-based bidding is not widely understood in the online advertising industry due to the lack of a comprehensive investigation of its impact. We here propose a practically implementable lift-based bidding system that perfectly fits the current billing rules. We conduct extensive experiments using a real-world advertising campaign and examine the performance under various settings. We find that lift-based bidding, especially unbiased lift-based bidding, is the most profitable for both DSP providers and advertisers. Our ablation study highlights that lift-based bidding has a good property for the currently dominant first-price auctions. The results will motivate the online …
Submitted: 22 February, 2022; originally announced February 2022.
Comments: 2021 IEEE International Conference on Big Data (Big Data)
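The core bidding rule is simple to sketch: bid on the predicted increment in conversion probability caused by showing the ad, rather than the raw predicted conversion rate. The two predictors below are hypothetical models, and the clipping at zero is an assumption for illustration.

```python
def lift_based_bid(x, cvr_with, cvr_without, value_per_conversion):
    """Bid in proportion to the incremental (causal) conversion probability
    of the impression for user/context x, not the raw conversion rate."""
    lift = cvr_with(x) - cvr_without(x)   # predicted uplift of the impression
    return max(lift, 0.0) * value_per_conversion
```

The contrast with conventional bidding is that a user who would convert anyway has high `cvr_with(x)` but near-zero lift, so a lift-based bidder spends nothing on them.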
arXiv:2202.06317 (https://arxiv.org/abs/2202.06317) [pdf, other]
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Machine Learning (stat.ML)
Title: Off-Policy Evaluation for Large Action Spaces via Embeddings
Authors: Yuta Saito, Thorsten Joachims
Abstract: Off-policy evaluation (OPE) in contextual bandits has seen rapid adoption in real-world systems, since it enables offline evaluation of new policies using only historic log data. Unfortunately, when the number of actions is large, existing OPE estimators -- most of which are based on inverse propensity score weighting -- degrade severely and can suffer from extreme bias and variance. This foils the use of OPE in many applications, from recommender systems to language models. To overcome this issue, we propose a new OPE estimator that leverages marginalized importance weights when action embeddings provide structure in the action space. We characterize the bias, variance, and mean squared error of the proposed estimator and analyze the conditions under which the action embedding provides statistical benefits over conventional estimators. In addition to the theoretical analysis, we find that the empirical performance improvement can be substantial, enabling reliable OPE even when existing estimators collapse due to a large number of actions.
Submitted: 15 June, 2022; v1 submitted 13 February, 2022; originally announced February 2022.
Comments: accepted at ICML2022
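The marginalized-importance-weight idea can be illustrated in a few lines, here with a discrete action embedding and context-independent marginals for brevity. This is a simplified sketch of the estimator family the abstract describes, not its full form; the key point is that the weight depends on the embedding rather than the (possibly enormous) action set.

```python
import numpy as np

def marginalized_ips(rewards, emb, p_emb_eval, p_emb_log):
    """OPE with marginalized importance weights: weight each logged round by
    the ratio of the action-embedding marginals under the evaluation and
    logging policies, instead of the per-action propensity ratio.
    emb: integer embedding id per round; p_emb_*: marginal distributions."""
    w = p_emb_eval[emb] / p_emb_log[emb]
    return np.mean(w * rewards)
```

Because many actions can share an embedding value, these weights have far lower variance than per-action weights when the embedding captures the reward-relevant structure.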
arXiv:2202.01562 (https://arxiv.org/abs/2202.01562) [pdf, other]
Subjects: Machine Learning (stat.ML); Machine Learning (cs.LG)
DOI: https://doi.org/10.1145/3488560.3498380
Title: Doubly Robust Off-Policy Evaluation for Ranking Policies under the Cascade Behavior Model
Authors: Haruka Kiyohara, Yuta Saito, Tatsuya Matsuhiro, Yusuke Narita, Nobuyuki Shimizu, Yasuo Yamamoto
Abstract: In real-world recommender systems and search engines, optimizing ranking decisions to present a ranked list of relevant items is critical. Off-policy evaluation (OPE) for ranking policies is thus gaining growing interest because it enables performance estimation of new ranking policies using only logged data. Although OPE in contextual bandits has been studied extensively, its naive application to the ranking setting faces a critical variance issue due to the huge item space. To tackle this problem, previous studies introduce assumptions on user behavior to make the combinatorial item space tractable. However, an unrealistic assumption may, in turn, cause serious bias. Therefore, appropriately controlling the bias-variance tradeoff by imposing a reasonable assumption is the key to success in OPE of ranking policies. To achieve a well-balanced bias-variance tradeoff, we propose the Cascade Doubly Robust estimator building on the cascade assumption, which assumes that a user interacts with items sequentially from the top position in a ranking. We show that the proposed estimator is unbiased in more cases compared to existing estimators that make stronger assumptions. Furthermore, compared to a previous estimator based on the same cascade assumption, the proposed estimator reduces the variance by leveraging a control variate. Comprehensive experiments on both synthetic and real-world data demonstrate that our estimator leads to more accurate OPE than existing estimators in a variety of settings.
Submitted: 3 February, 2022; originally announced February 2022.
Comments: WSDM2022
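A sketch of the cascade-style doubly robust computation, assuming the per-slot importance weights, a learned slot-level reward model (the control variate), and its per-slot expectation under the evaluation policy are all precomputed. Array shapes and names are assumptions; consult the paper for the exact estimator.

```python
import numpy as np

def cascade_dr(per_slot_w, rewards, q_hat, q_hat_exp):
    """Cascade-style doubly robust value estimate for ranked lists.
    per_slot_w[i, k]: importance weight of slot k given slots 1..k-1;
    rewards[i, k]: observed slot-level reward; q_hat[i, k]: reward model;
    q_hat_exp[i, k]: expectation of q_hat under the evaluation policy."""
    n, _ = rewards.shape
    w_cum = np.cumprod(per_slot_w, axis=1)                # w_{1:k}
    w_prev = np.hstack([np.ones((n, 1)), w_cum[:, :-1]])  # w_{1:k-1}
    v = np.sum(w_cum * (rewards - q_hat) + w_prev * q_hat_exp, axis=1)
    return v.mean()
```

The cascade assumption is what lets the weight at slot k depend only on the slots above it, so `w_cum` grows per position rather than over the full combinatorial ranking space.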
arXiv:2109.08621 (https://arxiv.org/abs/2109.08621) [pdf, ps, other]
Subjects: Artificial Intelligence (cs.AI)
Title: Data-Driven Off-Policy Estimator Selection: An Application in User Marketing on An Online Content Delivery Service
Authors: Yuta Saito, Takuma Udagawa, Kei Tateno
Abstract: Off-policy evaluation (OPE) is a method that attempts to estimate the performance of decision-making policies using historical data generated by different policies, without conducting costly online A/B tests. Accurate OPE is essential in domains such as healthcare, marketing, or recommender systems to avoid deploying poorly performing policies, as such policies may harm human lives or destroy the user experience. Thus, many OPE methods with theoretical backgrounds have been proposed. One emerging challenge with this trend is that a suitable estimator can be different for each application setting. It is often unknown to practitioners which estimator to use for their specific applications and purposes. To find a suitable estimator among many candidates, we use a data-driven estimator selection procedure for off-policy performance estimators as a practical solution. As a proof of concept, we use our procedure to select the best estimator for evaluating coupon treatment policies on a real-world online content delivery service. In the experiment, we first observe that the suitable estimator might change with different definitions of the outcome variable, and thus accurate estimator selection is critical in real-world applications of OPE. We then demonstrate that, by utilizing the estimator selection procedure, we can easily find suitable estimators for each purpose.
Submitted: 17 September, 2021; originally announced September 2021.
Comments: presented at REVEAL workshop, RecSys2020
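When a small on-policy sample of the evaluation policy is available, as in a limited A/B test, a data-driven selection protocol can reduce to ranking candidate estimators by error against that sample's mean reward. The sketch below is a hypothetical illustration of that protocol, not necessarily the procedure used in the paper; `estimators` maps names to estimator callables.

```python
import numpy as np

def rank_estimators(log, pi_e, on_policy_rewards, estimators):
    """Rank candidate OPE estimators by relative error against the mean
    reward observed in a small on-policy (A/B-test) sample of pi_e."""
    truth = np.mean(on_policy_rewards)
    rel_err = {name: abs(est(log, pi_e) - truth) / abs(truth)
               for name, est in estimators.items()}
    return sorted(rel_err, key=rel_err.get)  # best (lowest error) first
```

Rerunning this ranking per outcome definition matches the abstract's observation that the best estimator can change with the outcome variable.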
Next page (results 51-79): https://arxiv.org/search/?searchtype=author&query=Saito%2C+Y&start=50