Search | arXiv e-print repository

Showing 1–50 of 506 results for author: Cheng, H

Searching in archive cs. (Search v0.5.6, released 2020-02-24.)

1. arXiv:2411.10280 [pdf] (cs.HC)
From Score-Driven to Value-Sharing: Understanding Chinese Family Use of AI to Support Decision Making of College Applications
Authors: Si Chen, Jingyi Xie, Ge Wang, Haizhou Wang, Haocong Cheng, Yun Huang
Abstract: This study investigates how 18-year-old students, parents, and experts in China use artificial intelligence (AI) tools to support decision-making in college applications during the college entrance exam, a highly competitive, score-driven, annual national exam. Through 32 interviews, we examine the use of Quark GaoKao, an AI tool that generates college application lists and acceptance probabilities based on exam scores, historical data, preferred locations, etc. Our findings show that AI tools are predominantly used by parents with limited involvement from students, and that they often focus on immediate exam results, failing to address long-term career goals. We also identify challenges such as misleading AI recommendations and irresponsible use of AI by third-party consultant agencies. Finally, we offer design insights to better support multi-stakeholder decision-making in families, especially in the Chinese context, and discuss how emerging AI tools create barriers for families with fewer resources.
Submitted 15 November, 2024; originally announced November 2024.
2. arXiv:2411.09968 [pdf, other] (cs.CV, cs.AI)
Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate Hallucination in LVLMs
Authors: Xiaofeng Zhang, Yihao Quan, Chaochen Gu, Chen Shen, Xiaosong Yuan, Shaotian Yan, Hao Cheng, Kaijie Wu, Jieping Ye
Abstract: The hallucination problem in multimodal large language models (MLLMs) remains a common issue. Although image tokens occupy a majority of the input sequence of MLLMs, there has been limited research exploring the relationship between image tokens and hallucinations. In this paper, we analyze the distribution of attention scores for image tokens across each layer and head of the model, revealing an intriguing and common phenomenon: most hallucinations are closely linked to the pattern of attention sinks in the self-attention matrix of image tokens, where shallow layers exhibit dense attention sinks and deeper layers show sparse attention sinks. We further analyze the attention heads of different layers and find that heads with high-density attention sinks in the image part play a positive role in alleviating hallucinations. We propose a training-free method named Enhancing Attention Heads (EAH), an approach designed to enhance the convergence of image-token attention sinks in the shallow layers. EAH identifies the attention head that shows the vision sink in a shallow layer and extracts its attention matrix. This attention map is then broadcast to the other heads in the layer, thereby strengthening the layer's attention to the image itself. In extensive experiments, EAH shows significant hallucination-mitigating performance on different MLLMs and metrics, demonstrating its effectiveness and generality.
Submitted 15 November, 2024; originally announced November 2024.
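The abstract describes the EAH step concretely enough to sketch. Below is a minimal illustration, not the authors' code: it assumes a per-layer attention tensor of shape (heads, seq, seq), picks the head with the densest mass on the image-token positions, and broadcasts that head's map to the others; the function name and the density proxy are ours.

```python
import torch

def enhance_attention_heads(attn, image_slice, eps=1e-6):
    """Sketch of the EAH idea from the abstract (hypothetical API).

    attn:        (num_heads, seq_len, seq_len) self-attention of one shallow layer
    image_slice: slice selecting the image-token positions in the sequence
    Replaces every head with the head that puts the most mass on image tokens.
    """
    # Attention mass each head places on image-token keys -- a crude sink proxy.
    sink_mass = attn[:, :, image_slice].sum(dim=(1, 2))
    best_head = sink_mass.argmax()
    # Broadcast the sink head's map to all heads, then renormalize rows.
    boosted = attn[best_head].unsqueeze(0).expand_as(attn).clone()
    return boosted / (boosted.sum(dim=-1, keepdim=True) + eps)

# Toy usage: 8 heads, 16 tokens, positions 4..12 are image tokens.
attn = torch.softmax(torch.randn(8, 16, 16), dim=-1)
out = enhance_attention_heads(attn, slice(4, 12))
print(out.shape)  # torch.Size([8, 16, 16])
```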
3. arXiv:2411.09873 [pdf, other] (cs.HC)
LLM-Powered AI Tutors with Personas for d/Deaf and Hard-of-Hearing Online Learners
Authors: Haocong Cheng, Si Chen, Christopher Perdriau, Yun Huang
Abstract: Intelligent tutoring systems (ITS) using artificial intelligence (AI) technology have shown promise in supporting learners with diverse abilities; however, they often fail to meet the specific communication needs and cultural nuances of d/Deaf and Hard-of-Hearing (DHH) learners. As large language models (LLMs) provide new opportunities to incorporate personas into AI-based tutors and to support dynamic interactive dialogue, this paper explores how DHH learners perceive LLM-powered ITS with different personas and identifies design suggestions for improving the interaction. We developed an interface that allows DHH learners to interact with ChatGPT and with three LLM-powered AI tutors that have different levels of experience in DHH education while the learners watch an educational video. A user study with 16 DHH participants showed that they perceived conversations with the AI tutors who had DHH education experience as more human-like and trustworthy, owing to the tutors' cultural knowledge of DHH communities. Participants also suggested providing more transparency about the tutors' background information to clarify each AI tutor's position within the DHH community. We discuss design implications for more inclusive LLM-based systems, such as support for the multimodality of sign language.
Submitted 14 November, 2024; originally announced November 2024.
4. arXiv:2411.05877 [pdf, other] (cs.LG, cs.AI, cs.CL, stat.ML)
Generative Adapter: Contextualizing Language Models in Parameters with A Single Forward Pass
Authors: Tong Chen, Hao Fang, Patrick Xia, Xiaodong Liu, Benjamin Van Durme, Luke Zettlemoyer, Jianfeng Gao, Hao Cheng
Abstract: Large language models (LMs) are typically adapted to improve performance on new contexts (e.g., text prompts that define new tasks or domains) through fine-tuning or prompting. However, there is an accuracy-compute tradeoff: fine-tuning incurs significant training cost, and prompting increases inference overhead. We introduce GenerativeAdapter, an effective and efficient adaptation method that directly maps new contexts to low-rank LM adapters, thereby significantly reducing inference overhead with no need for fine-tuning. The adapter generator is trained via self-supervised learning and can be used to adapt a single frozen LM to any new task simply by mapping the associated task or domain context to a new adapter. We apply GenerativeAdapter to two pretrained LMs (Mistral-7B-Instruct and Llama2-7B-Chat) and evaluate the adapted models in three adaptation scenarios: knowledge acquisition from documents, learning from demonstrations, and personalization for users. On StreamingQA, our approach is effective in injecting knowledge into the LM's parameters, achieving a 63.5% improvement in F1 score over the model with supervised fine-tuning (from 19.5 to 31.5) for contexts as long as 32K tokens. In the MetaICL in-context learning evaluation, our method achieves an average accuracy of 44.9 across 26 tasks, outperforming the base model. On MSC, our method proves highly competitive in memorizing user information from conversations, with a 4x reduction in computation and memory costs compared to prompting with the full conversation history. Together, these results suggest that GenerativeAdapter should allow for general adaptation to a wide range of different contexts.
Submitted 7 November, 2024; originally announced November 2024.
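The core mechanism the abstract states, mapping a context to a low-rank adapter in one forward pass, can be sketched as follows. This is an illustrative stand-in, not the paper's architecture: the generator here is a pair of linear maps producing LoRA-style factors for a single frozen weight matrix, and all dimensions are toy values.

```python
import torch
import torch.nn as nn

class AdapterGenerator(nn.Module):
    """Illustrative sketch (not the authors' code): map a context embedding
    to a rank-r adapter (A, B) for one frozen weight matrix W (d_out x d_in)."""
    def __init__(self, ctx_dim, d_in, d_out, rank=8):
        super().__init__()
        self.rank, self.d_in, self.d_out = rank, d_in, d_out
        self.to_a = nn.Linear(ctx_dim, rank * d_in)
        self.to_b = nn.Linear(ctx_dim, d_out * rank)

    def forward(self, ctx_emb):
        # A single forward pass yields the adapter weights -- no gradient steps.
        A = self.to_a(ctx_emb).view(self.rank, self.d_in)
        B = self.to_b(ctx_emb).view(self.d_out, self.rank)
        return A, B

d_in = d_out = 64
gen = AdapterGenerator(ctx_dim=32, d_in=d_in, d_out=d_out)
W = torch.randn(d_out, d_in)            # frozen base weight
A, B = gen(torch.randn(32))             # context embedding -> adapter
x = torch.randn(d_in)
y = W @ x + B @ (A @ x)                 # adapted forward, LoRA-style
print(y.shape)                          # torch.Size([64])
```

In the paper's framing the generator itself is trained once (self-supervised), after which adapting to a new context costs only this one pass instead of fine-tuning.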
5. arXiv:2411.05361 [pdf, other] (cs.CL, eess.AS)
Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks
Authors: Chien-yu Huang, Wei-Chih Chen, Shu-wen Yang, Andy T. Liu, Chen-An Li, Yu-Xiang Lin, Wei-Cheng Tseng, Anuj Diwan, Yi-Jen Shih, Jiatong Shi, William Chen, Xuanjun Chen, Chi-Yuan Hsiao, Puyuan Peng, Shih-Heng Wang, Chun-Yi Kuan, Ke-Han Lu, Kai-Wei Chang, Chih-Kai Yang, Fabian Ritter-Gutierrez, Ming To Chuang, Kuan-Po Huang, Siddhant Arora, You-Kuan Lin, Eunjung Yeo, et al. (53 additional authors not shown)
Abstract: Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks and making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally: SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovation to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline.
Submitted 8 November, 2024; originally announced November 2024.
6. arXiv:2411.00684 [pdf] (cs.LG)
Explainable few-shot learning workflow for detecting invasive and exotic tree species
Authors: Caroline M. Gevaert, Alexandra Aguiar Pedro, Ou Ku, Hao Cheng, Pranav Chandramouli, Farzaneh Dadrass Javan, Francesco Nattino, Sonja Georgievska
Abstract: Deep learning methods are notorious for relying on extensive labeled datasets to train and assess their performance, which causes difficulties in practical situations where models must be trained for new applications with very little available data. While few-shot learning algorithms can address the first problem, they still lack sufficient explanations for their results. This research presents a workflow that tackles both challenges by proposing an explainable few-shot learning workflow for detecting invasive and exotic tree species in the Atlantic Forest of Brazil using Unmanned Aerial Vehicle (UAV) images. By integrating a Siamese network with explainable AI (XAI), the workflow enables the classification of tree species with minimal labeled data while providing visual, case-based explanations for the predictions. Results demonstrate the effectiveness of the proposed workflow in identifying new tree species, even in data-scarce conditions. With a lightweight backbone, e.g., MobileNet, it achieves an F1-score of 0.86 in 3-shot learning, outperforming a shallow CNN. A set of explanation metrics (correctness, continuity, and contrastivity), accompanied by visual cases, provides further insight into the prediction results. This approach opens new avenues for using AI and UAVs in forest management and biodiversity conservation, particularly for rare or under-studied species.
Submitted 1 November, 2024; originally announced November 2024.
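The Siamese few-shot classification step, paired with the case-based explanations the abstract mentions, can be sketched roughly as below. The embedding network is a tiny stand-in (the paper uses a lightweight backbone such as MobileNet), and the 1-nearest-neighbor decision rule is our assumption of how a Siamese comparison could be used here.

```python
import torch
import torch.nn as nn

# Stand-in embedding network; the paper would use e.g. a MobileNet backbone.
embed = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2), nn.ReLU(),
                      nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 32))

def few_shot_predict(query, support, support_labels):
    """Label a query image by its nearest support embedding (1-NN, Siamese-style).
    The nearest support image doubles as a case-based explanation."""
    with torch.no_grad():
        q = embed(query.unsqueeze(0))           # (1, 32)
        s = embed(support)                      # (n_class * n_shot, 32)
        dists = torch.cdist(q, s).squeeze(0)    # distance to every support case
        nearest = dists.argmin().item()
    return support_labels[nearest], nearest     # prediction + explaining case

support = torch.randn(6, 3, 64, 64)   # toy data: 3 species x 2 shots
labels = ["sp_A", "sp_A", "sp_B", "sp_B", "sp_C", "sp_C"]
pred, case = few_shot_predict(torch.randn(3, 64, 64), support, labels)
print(pred, "explained by support case", case)
```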
7. arXiv:2410.19056 [pdf, other] (cs.AI)
ReasonAgain: Using Extractable Symbolic Programs to Evaluate Mathematical Reasoning
Authors: Xiaodong Yu, Ben Zhou, Hao Cheng, Dan Roth
Abstract: Existing math datasets evaluate the reasoning abilities of large language models (LLMs) using either the final answer or intermediate reasoning steps derived from static examples. However, the former approach fails to surface a model's use of shortcuts and flawed reasoning, while the latter poses challenges in accommodating alternative solutions. In this work, we use symbolic programs as a means of automated evaluation, testing whether a model can consistently produce correct final answers across various inputs to the program. We begin by extracting programs for popular math datasets (GSM8K and MATH) using GPT-4o. Executable programs verified against the original input-output pairs are found to encapsulate the reasoning required to solve the original text questions. We then prompt GPT-4o to generate new questions using alternative input-output pairs based on the extracted programs. We apply the resulting datasets to evaluate a collection of LLMs. In our experiments, we observe significant accuracy drops under the proposed evaluation compared with the original static examples, suggesting the fragility of math reasoning in state-of-the-art LLMs.
Submitted 24 October, 2024; originally announced October 2024.
8. arXiv:2410.18469 [pdf, other] (cs.CL, cs.LG)
Iterative Self-Tuning LLMs for Enhanced Jailbreaking Capabilities
Authors: Chung-En Sun, Xiaodong Liu, Weiwei Yang, Tsui-Wei Weng, Hao Cheng, Aidan San, Michel Galley, Jianfeng Gao
Abstract: Recent research has shown that Large Language Models (LLMs) are vulnerable to automated jailbreak attacks, in which algorithmically crafted adversarial suffixes appended to harmful queries bypass safety alignment and trigger unintended responses. Current methods for generating these suffixes are computationally expensive and have low Attack Success Rates (ASR), especially against well-aligned models like Llama2 and Llama3. To overcome these limitations, we introduce ADV-LLM, an iterative self-tuning process that crafts adversarial LLMs with enhanced jailbreak ability. Our framework significantly reduces the computational cost of generating adversarial suffixes while achieving nearly 100% ASR on various open-source LLMs. Moreover, it exhibits strong attack transferability to closed-source models, achieving 99% ASR on GPT-3.5 and 49% ASR on GPT-4, despite being optimized solely on Llama3. Beyond improving jailbreak ability, ADV-LLM provides valuable insights for future safety alignment research through its ability to generate large datasets for studying LLM safety.
Submitted 25 October, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: 18 pages.
9. arXiv:2410.18209 [pdf, other] (cs.CL)
CorrectionLM: Self-Corrections with SLM for Dialogue State Tracking
Authors: Chia-Hsuan Lee, Hao Cheng, Mari Ostendorf
Abstract: Large language models (LLMs) have demonstrated self-improvement capabilities via feedback and refinement, but current small language models (SLMs) have had limited success in this area. Existing correction approaches often rely on distilling knowledge from LLMs, which imposes significant computation demands. In this work, we introduce CORRECTIONLM, a novel correction framework that enables SLMs to self-correct using in-context exemplars without LLM involvement. Applied to two dialogue state tracking (DST) tasks in low-resource settings, CORRECTIONLM achieves results similar to a state-of-the-art LLM at a small fraction of the computation costs.
Submitted 23 October, 2024; originally announced October 2024.
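A plausible shape for the in-context self-correction the abstract describes is sketched below: the SLM first predicts a dialogue state, then revises it given correction exemplars in the prompt. The exemplar content, field names, and prompt wording are illustrative only, not the paper's format.

```python
# Illustrative correction-prompt construction for DST (not the paper's format).
exemplars = [
    {"dialogue": "User: book a cheap hotel in the north",
     "initial":   {"hotel-price": "cheap", "hotel-area": "centre"},
     "corrected": {"hotel-price": "cheap", "hotel-area": "north"}},
]

def correction_prompt(dialogue, initial_state):
    """Build a prompt asking the SLM to revise its own predicted state."""
    parts = ["Revise the predicted dialogue state. Examples:"]
    for ex in exemplars:
        parts.append(f"Dialogue: {ex['dialogue']}\n"
                     f"Predicted: {ex['initial']}\nCorrected: {ex['corrected']}")
    parts.append(f"Dialogue: {dialogue}\nPredicted: {initial_state}\nCorrected:")
    return "\n\n".join(parts)

print(correction_prompt("User: I need a taxi to the airport at 5pm",
                        {"taxi-destination": "airport"}))
```

The key property, per the abstract, is that both the initial prediction and the correction come from the same small model; no LLM is queried at any step.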
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16704v1-abstract-full').style.display = 'none'; document.getElementById('2410.16704v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 3 figures. Comments are welcome!</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13726">arXiv:2410.13726</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13726">pdf</a>, <a href="https://arxiv.org/format/2410.13726">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DAWN: Dynamic Frame Avatar with Non-autoregressive Diffusion Framework for Talking Head Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hanbo Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Limin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chenyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+P">Pengcheng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+P">Pengfei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jiefeng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13726v2-abstract-short" style="display: inline;"> Talking head generation intends to produce vivid and realistic talking head videos from a single portrait and speech audio clip. Although significant progress has been made in diffusion-based talking head generation, almost all methods rely on autoregressive strategies, which suffer from limited context utilization beyond the current generation step, error accumulation, and slower generation speed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13726v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13726v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13726v2-abstract-full" style="display: none;"> Talking head generation intends to produce vivid and realistic talking head videos from a single portrait and speech audio clip. Although significant progress has been made in diffusion-based talking head generation, almost all methods rely on autoregressive strategies, which suffer from limited context utilization beyond the current generation step, error accumulation, and slower generation speed. 
11. arXiv:2410.13726 [pdf, other] (cs.CV, cs.AI)
DAWN: Dynamic Frame Avatar with Non-autoregressive Diffusion Framework for Talking Head Video Generation
Authors: Hanbo Cheng, Limin Lin, Chenyu Liu, Pengcheng Xia, Pengfei Hu, Jiefeng Ma, Jun Du, Jia Pan
Abstract: Talking head generation aims to produce vivid and realistic talking head videos from a single portrait and a speech audio clip. Although significant progress has been made in diffusion-based talking head generation, almost all methods rely on autoregressive strategies, which suffer from limited context utilization beyond the current generation step, error accumulation, and slower generation speed. To address these challenges, we present DAWN (Dynamic frame Avatar With Non-autoregressive diffusion), a framework that enables all-at-once generation of dynamic-length video sequences. Specifically, it consists of two main components: (1) audio-driven holistic facial dynamics generation in the latent motion space, and (2) audio-driven head pose and blink generation. Extensive experiments demonstrate that our method generates authentic and vivid videos with precise lip motions and natural pose/blink movements. Additionally, with its high generation speed, DAWN possesses strong extrapolation capabilities, ensuring the stable production of high-quality long videos. These results highlight the considerable promise and potential impact of DAWN in the field of talking head video generation. Furthermore, we hope that DAWN sparks further exploration of non-autoregressive approaches in diffusion models. Our code will be publicly available at https://github.com/Hanbo-Cheng/DAWN-pytorch.
Submitted 18 October, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
However, in the current Open RAN community, an open-source end-to-end slicing solution for 5G is still missing. To bridge this gap, we developed ORANSlice, an open-source network slicing-enabled Open RAN system integrated with popular open-source RAN frameworks. ORANSlice features programmable, 3GPP-compliant RAN slicing and scheduling functionalities. It supports RAN slicing control and optimization via xApps on the near-real-time RAN Intelligent Controller (RIC), thanks to an extension of the E2 interface between RIC and RAN and to service models for slicing. We deploy and test ORANSlice on different O-RAN testbeds and demonstrate its capabilities on different use cases, including slice prioritization and minimum radio resource guarantee.
Submitted: 16 October, 2024; originally announced October 2024.

arXiv:2410.11802 [pdf, other] (https://arxiv.org/abs/2410.11802)
Subjects: cs.LG (Machine Learning)
Title: FoundTS: Comprehensive and Unified Benchmarking of Foundation Models for Time Series Forecasting
Authors: Zhe Li, Xiangfei Qiu, Peng Chen, Yihang Wang, Hanyin Cheng, Yang Shu, Jilin Hu, Chenjuan Guo, Aoying Zhou, Qingsong Wen,
Christian S. Jensen, Bin Yang
Abstract: Time Series Forecasting (TSF) is a key capability in numerous fields, including finance, weather services, and energy management. While TSF methods continue to emerge, many require domain-specific data collection and model training and generalize poorly to new domains. Foundation models aim to overcome this limitation. Pre-trained on large-scale language or time series data, they exhibit promising inference capabilities on new or unseen data. This has spurred a surge in new TSF foundation models. We propose a new benchmark, FoundTS, to enable thorough and fair evaluation and comparison of such models. FoundTS covers a variety of TSF foundation models, including those based on large language models and those pretrained on time series. Next, FoundTS supports different forecasting strategies, including zero-shot, few-shot, and full-shot, thereby facilitating more thorough evaluations. Finally, FoundTS offers a pipeline that standardizes evaluation processes such as dataset splitting, loading, normalization, and few-shot sampling, thereby facilitating fair evaluations. Building on this, we report on an extensive evaluation of TSF foundation models on a broad range of datasets from diverse domains and with different statistical characteristics. Specifically, we identify the pros, cons, and inherent limitations of existing foundation models, and we identify directions for future model design. We make our code and datasets available at https://anonymous.4open.science/r/FoundTS-C2B0.
Submitted: 1 November, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
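To make concrete what a standardized evaluation pipeline of this kind involves, here is a minimal sketch of fixed splitting, train-only normalization, and seeded few-shot sampling applied identically to every model under test. All names (standardized_eval, the model.fit/model.predict interface) are hypothetical placeholders, not the FoundTS API.

```python
import numpy as np

def standardized_eval(series, model, horizon=96, few_shot_frac=0.1, seed=0):
    """Hypothetical unified evaluation loop: every forecaster gets the
    same split, the same normalization, and the same few-shot sample."""
    # Fixed chronological split so all models see identical data.
    n = len(series)
    train, test = series[: int(0.7 * n)], series[int(0.7 * n):]
    # Normalize with statistics from the training split only.
    mu, sigma = train.mean(), train.std() + 1e-8
    train, test = (train - mu) / sigma, (test - mu) / sigma
    # Few-shot: expose a seeded, fixed-size window of the training data.
    rng = np.random.default_rng(seed)
    k = max(1, int(few_shot_frac * len(train)))
    start = rng.integers(0, len(train) - k + 1)
    model.fit(train[start : start + k])           # assumed model interface
    # Assumes len(test) > horizon: forecast the last `horizon` points.
    preds = model.predict(context=test[:-horizon], horizon=horizon)
    return np.abs(preds - test[-horizon:]).mean() # common metric (MAE)
```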
arXiv:2410.11719 [pdf, other] (https://arxiv.org/abs/2410.11719)
Subjects: cs.IR (Information Retrieval)
Title: Adaptive Coordinators and Prompts on Heterogeneous Graphs for Cross-Domain Recommendations
Authors: Hengyu Zhang, Chunxu Shen, Xiangguo Sun, Jie Tan, Yu Rong, Chengzhi Piao, Hong Cheng, Lingling Yi
Abstract: In the online digital world, users frequently engage with diverse items across multiple domains (e.g., e-commerce platforms, streaming services, and social media networks), forming complex heterogeneous interaction graphs. Leveraging this multi-domain information can undoubtedly enhance the performance of recommendation systems by providing more comprehensive user insights and alleviating data sparsity in individual domains. However, integrating multi-domain knowledge for cross-domain recommendation is challenging due to inherent disparities in user behavior and item characteristics, and due to the risk of negative transfer, where irrelevant or conflicting information from the source domains adversely impacts the target domain's performance. To address these challenges, we propose HAGO, a novel framework with $\textbf{H}$eterogeneous $\textbf{A}$daptive $\textbf{G}$raph co$\textbf{O}$rdinators, which dynamically integrates multi-domain graphs into a cohesive structure by adaptively adjusting the connections between coordinators and multi-domain graph nodes, thereby enhancing beneficial inter-domain interactions while mitigating negative transfer effects.
Additionally, we develop a universal multi-domain graph pre-training strategy alongside HAGO to collaboratively learn high-quality node representations across domains. To effectively transfer the learned multi-domain knowledge to the target domain, we design an effective graph prompting method, which incorporates pre-trained embeddings with learnable prompts for the recommendation task. Our framework is compatible with various graph-based models and pre-training techniques, demonstrating broad applicability and effectiveness. Experimental results further show that our solutions outperform state-of-the-art methods in multi-domain recommendation scenarios and highlight their potential for real-world applications.
Submitted: 15 October, 2024; originally announced October 2024.
Comments: Under review

arXiv:2410.08937 [pdf, other] (https://arxiv.org/abs/2410.08937)
Subjects: quant-ph (Quantum Physics); cs.IT (Information Theory)
Title: Distributed Quantum Hypothesis Testing under Zero-rate Communication Constraints
Authors: Sreejith Sreekumar, Christoph Hirche, Hao-Chung Cheng, Mario Berta
Abstract: The trade-offs between error probabilities in quantum hypothesis testing are by now well understood in the centralized setting, but much less is known for distributed settings.
Here, we study a distributed binary hypothesis testing problem to infer a bipartite quantum state shared between two remote parties, where one of these parties communicates classical information to the tester at zero-rate (while the other party communicates classical or quantum information to the tester at zero-rate or higher). As our main contribution, we derive an efficiently computable single-letter formula for the Stein exponent of this problem when the state under the alternative is a product state. For the general case, we show that the Stein exponent is given by a multi-letter expression involving a max-min optimization of the regularized measured relative entropy. While this expression becomes single-letter in the fully classical case, we further prove that the same no longer holds for classical-quantum states in general. As a key tool for proving the converse direction of our results, we develop a quantum version of the blowing-up lemma, which may be of independent interest.
Submitted: 11 October, 2024; originally announced October 2024.
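As background for the "Stein exponent" terminology (standard centralized-setting material, not the paper's distributed result): quantum Stein's lemma states that, when testing $n$ copies of $\rho$ against $n$ copies of $\sigma$ with the type-I error held below $\varepsilon$, the optimal type-II error $\beta_n(\varepsilon)$ decays exponentially at a rate given by the quantum relative entropy, independently of $\varepsilon \in (0,1)$:

$$
\lim_{n\to\infty} -\frac{1}{n}\log\beta_n(\varepsilon) \;=\; D(\rho\,\|\,\sigma) \;=\; \operatorname{Tr}\!\left[\rho\,(\log\rho-\log\sigma)\right].
$$

The distributed question studied in the paper asks what replaces the right-hand side when the tester only receives rate-limited classical messages about one share of a bipartite state.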
arXiv:2410.08739 [pdf, other] (https://arxiv.org/abs/2410.08739)
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.SY (Systems and Control)
Title: MMLF: Multi-modal Multi-class Late Fusion for Object Detection with Uncertainty Estimation
Authors: Qihang Yang, Yang Zhao, Hong Cheng
Abstract: Autonomous driving necessitates advanced object detection techniques that integrate information from multiple modalities to overcome the limitations of single-modal approaches. The challenges of aligning diverse data in early fusion, along with the complexity and overfitting issues introduced by deep fusion, underscore the efficacy of late fusion at the decision level. Late fusion ensures seamless integration without altering the original detector's network structure. This paper introduces a Multi-modal Multi-class Late Fusion (MMLF) method that performs fusion at the decision level and supports multi-class detection. Fusion experiments conducted on the KITTI validation and official test datasets illustrate substantial performance improvements, presenting our model as a versatile solution for multi-modal object detection in autonomous driving. Moreover, our approach incorporates uncertainty analysis into the classification fusion process, rendering our model more transparent and trustworthy and providing more reliable insights into category predictions.
Submitted: 11 October, 2024; originally announced October 2024.
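To illustrate what "fusion at the decision level" means in general (a toy sketch, not the MMLF method itself): each modality's detector emits class probabilities for a matched box, the fused score is a weighted average of those opinions, and the spread across detectors can serve as a crude disagreement signal.

```python
import numpy as np

def late_fuse(class_probs, weights=None):
    """Decision-level fusion of per-modality class distributions.

    class_probs: list of arrays, one per detector (e.g., camera, LiDAR),
                 each a normalized distribution over the same classes.
    Returns the fused distribution and a disagreement score.
    """
    P = np.stack(class_probs)                   # (num_modalities, num_classes)
    w = np.ones(len(P)) / len(P) if weights is None else np.asarray(weights)
    fused = w @ P                               # weighted average of opinions
    fused /= fused.sum()
    disagreement = P.std(axis=0).mean()         # crude cross-detector spread
    return fused, disagreement

# Example: camera is confident it's a car; LiDAR hedges toward pedestrian.
fused, dis = late_fuse([np.array([0.80, 0.15, 0.05]),
                        np.array([0.40, 0.50, 0.10])])
```

Note that nothing in the detectors' networks changes; fusion consumes only their outputs, which is the property the abstract highlights.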
arXiv:2410.07051 [pdf, other] (https://arxiv.org/abs/2410.07051)
Subjects: cs.IT (Information Theory); quant-ph (Quantum Physics)
Title: Exponents for Shared Randomness-Assisted Channel Simulation
Authors: Aadil Oufkir, Michael X. Cao, Hao-Chung Cheng, Mario Berta
Abstract: We determine the exact error and strong converse exponents of shared randomness-assisted channel simulation in worst-case total-variation distance. Namely, we find that these exponents can be written as simple optimizations over the Rényi channel mutual information. Strikingly, and in stark contrast to channel coding, there are no critical rates, allowing a tight characterization for arbitrary rates below and above the simulation capacity. We derive our results by asymptotically expanding the meta-converse for channel simulation [Cao et al., IEEE Trans. Inf. Theory (2024)], which corresponds to non-signaling assisted codes. We prove this to be asymptotically tight by employing the approximation algorithms from [Berta et al., Proc. IEEE ISIT (2024)], which show how to round any non-signaling assisted strategy to a strategy that only uses shared randomness. Notably, this implies that any additional quantum entanglement assistance does not change the error or the strong converse exponents.
Submitted: 9 October, 2024; originally announced October 2024.
Comments: 27+6 pages
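For reference, the Rényi quantities underlying such exponent formulas are built from Rényi divergences; one standard quantum version (the Petz divergence) is, for $\alpha \in (0,1)\cup(1,\infty)$,

$$
D_\alpha(\rho\,\|\,\sigma) \;=\; \frac{1}{\alpha-1}\,\log\operatorname{Tr}\!\left[\rho^{\alpha}\sigma^{1-\alpha}\right],
$$

and a Rényi channel mutual information is then obtained by a suitable optimization of such a divergence over channel inputs. The paper's exponents are optimizations of this kind; the exact divergence variant and optimization are specified in the paper, not here.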
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27+6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06934">arXiv:2410.06934</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06934">pdf</a>, <a href="https://arxiv.org/format/2410.06934">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> VEC-Sim: A Simulation Platform for Evaluating Service Caching and Computation Offloading Policies in Vehicular Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xiaolong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Bilal%2C+M">Muhammad Bilal</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiangwei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Siyu Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06934v1-abstract-short" style="display: inline;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06934v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06934v1-abstract-full" style="display: none;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill this gap, this paper proposes VEC-Sim, a versatile simulation platform for in-depth evaluation and analysis of various service caching and computation offloading policies in VEC networks. VEC-Sim incorporates realistic mechanisms to replicate real-world access patterns, including service feature vector, vehicle mobility modeling, evolving service popularity, new service upload and user preference shifts, etc. Moreover, its modular architecture and extensive Application Programming Interfaces (APIs) allow seamless integration of customized scheduling policies and user-defined metrics. A comprehensive evaluation of VEC-Sim&#39;s capabilities is undertaken in comparison to real-world ground truths. Results prove it to be accurate in reproducing classical scheduling algorithms and extremely effective in conducting case studies. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'none'; document.getElementById('2410.06934v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03111">arXiv:2410.03111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03111">pdf</a>, <a href="https://arxiv.org/format/2410.03111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LoRC: Low-Rank Compression for LLMs KV Cache with a Progressive Compression Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rongzhi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kuang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Liyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuohang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yelong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03111v1-abstract-short" style="display: inline;"> The Key-Value (KV) cache is a crucial component in serving transformer-based autoregressive large language models (LLMs), enabling faster inference by storing previously computed KV vectors. However, its memory consumption scales linearly with sequence length and batch size, posing a significant bottleneck in LLM deployment. Existing approaches to mitigate this issue include: (1) efficient attenti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03111v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03111v1-abstract-full" style="display: none;"> The Key-Value (KV) cache is a crucial component in serving transformer-based autoregressive large language models (LLMs), enabling faster inference by storing previously computed KV vectors. However, its memory consumption scales linearly with sequence length and batch size, posing a significant bottleneck in LLM deployment. Existing approaches to mitigate this issue include: (1) efficient attention variants integrated in upcycling stages, which requires extensive parameter tuning thus unsuitable for pre-trained LLMs; (2) KV cache compression at test time, primarily through token eviction policies, which often overlook inter-layer dependencies and can be task-specific. 
arXiv:2410.02249 [pdf, other] (https://arxiv.org/abs/2410.02249)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.NE (Neural and Evolutionary Computing)
Title: Spiking Neural Network as Adaptive Event Stream Slicer
Authors: Jiahang Cao, Mingyuan Sun, Ziqing Wang, Hao Cheng, Qiang Zhang, Shibo Zhou, Renjing Xu
Abstract: Event-based cameras are attracting significant interest as they provide rich edge information, high dynamic range, and high temporal resolution.
Many state-of-the-art event-based algorithms rely on splitting the events into fixed groups, resulting in the omission of crucial temporal information, particularly when dealing with diverse motion scenarios (e.g., high/low speed). In this work, we propose SpikeSlicer, a novel plug-and-play event processing method capable of splitting the event stream adaptively. SpikeSlicer utilizes a low-energy spiking neural network (SNN) to trigger event slicing. To guide the SNN to fire spikes at optimal time steps, we propose the Spiking Position-aware Loss (SPA-Loss) to modulate the neuron's state. Additionally, we develop a Feedback-Update training strategy that refines the slicing decisions using feedback from the downstream artificial neural network (ANN). Extensive experiments demonstrate that our method yields significant performance improvements in event-based object tracking and recognition. Notably, SpikeSlicer provides a brand-new SNN-ANN cooperation paradigm, where the SNN acts as an efficient, low-energy data processor to assist the ANN in improving downstream performance, injecting new perspectives and potential avenues of exploration.
Submitted: 8 November, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
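As background on the triggering mechanism: a leaky integrate-and-fire (LIF) neuron accumulates input with a leak and emits a spike when its membrane potential crosses a threshold, which is the generic primitive an SNN-based slicer can use to cut an event stream. The toy sketch below illustrates that primitive only; it is not SpikeSlicer, and the parameter values are arbitrary.

```python
def lif_slice_points(event_counts, decay=0.9, threshold=5.0):
    """Toy leaky integrate-and-fire trigger over per-bin event counts.

    Integrates activity with a leak; a threshold crossing fires a spike,
    marking a slice boundary, and resets the membrane potential.
    """
    v, cuts = 0.0, []
    for t, x in enumerate(event_counts):
        v = decay * v + x          # leaky integration of input activity
        if v >= threshold:         # spike: end the current event slice here
            cuts.append(t)
            v = 0.0                # hard reset after firing
    return cuts

# Dense motion triggers early cuts; sparse motion yields longer slices.
print(lif_slice_points([1, 1, 4, 0, 0, 1, 6, 2]))   # -> [2, 6]
```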
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02052">arXiv:2410.02052</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02052">pdf</a>, <a href="https://arxiv.org/format/2410.02052">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ExACT: Teaching AI Agents to Explore with Reflective-MCTS and Exploratory Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xiao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+B">Baolin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Vajipey%2C+V">Vineeth Vajipey</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Galley%2C+M">Michel Galley</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhou Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02052v3-abstract-short" style="display: inline;"> Autonomous agents have demonstrated significant potential in automating complex multistep decision-making tasks. However, even state-of-the-art vision-language models (VLMs), such as GPT-4o, still fall short of human-level performance, particularly in intricate web environments and long-horizon tasks. To address these limitations, we present ExACT, an approach to combine test-time search and self-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02052v3-abstract-full').style.display = 'inline'; document.getElementById('2410.02052v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02052v3-abstract-full" style="display: none;"> Autonomous agents have demonstrated significant potential in automating complex multistep decision-making tasks. However, even state-of-the-art vision-language models (VLMs), such as GPT-4o, still fall short of human-level performance, particularly in intricate web environments and long-horizon tasks. To address these limitations, we present ExACT, an approach to combine test-time search and self-learning to build o1-like models for agentic applications. We first introduce Reflective Monte Carlo Tree Search (R-MCTS), a novel test time algorithm designed to enhance AI agents&#39; ability to explore decision space on the fly. R-MCTS extends traditional MCTS by 1) incorporating contrastive reflection, allowing agents to learn from past interactions and dynamically improve their search efficiency; and 2) using multi-agent debate for reliable state evaluation. Next, we introduce Exploratory Learning, a novel learning strategy to teach agents to search at inference time without relying on any external search algorithms. 
On the challenging VisualWebArena benchmark, our GPT-4o based R-MCTS agent achieves a 6% to 30% relative improvement across various tasks compared to the previous state of the art. Additionally, we show that the knowledge and experience gained from test-time search can be effectively transferred back to GPT-4o via fine-tuning. After Exploratory Learning, GPT-4o (1) demonstrates the ability to explore the environment, evaluate a state, and backtrack to viable ones when it detects that the current state cannot lead to success, and (2) matches 87% of R-MCTS's performance while using significantly less compute. Notably, our work demonstrates compute scaling properties at both training time (data collection with R-MCTS) and testing time. These results suggest a promising research direction for enhancing VLMs' capabilities for agentic applications via test-time search and self-learning.
Submitted: 17 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
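For context, the "traditional MCTS" that R-MCTS extends selects actions by the UCT rule: mean value plus an exploration bonus. A minimal version of that standard rule (algorithmic background, not the paper's reflective variant):

```python
import math

def uct_select(children, c=1.4):
    """Pick a child by UCT: exploitation (mean value) + exploration bonus.

    children: list of dicts with cumulative reward 'value' and visits 'n'.
    """
    total = max(1, sum(ch["n"] for ch in children))
    def score(ch):
        if ch["n"] == 0:
            return float("inf")    # always try unvisited actions first
        return ch["value"] / ch["n"] + c * math.sqrt(math.log(total) / ch["n"])
    return max(range(len(children)), key=lambda i: score(children[i]))

# Example: the under-visited third child wins on its exploration bonus.
print(uct_select([{"value": 5.0, "n": 10},
                  {"value": 2.0, "n": 8},
                  {"value": 1.5, "n": 2}]))        # -> 2
```

R-MCTS, per the abstract, replaces parts of this loop with learned components: reflections steer the search and multi-agent debate supplies state values.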
arXiv:2410.01635 [pdf, other] (https://arxiv.org/abs/2410.01635)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.SI (Social and Information Networks)
Title: Does Graph Prompt Work? A Data Operation Perspective with Theoretical Analysis
Authors: Qunzhong Wang, Xiangguo Sun, Hong Cheng
Abstract: In recent years, graph prompting has emerged as a promising research direction, enabling the learning of additional tokens or subgraphs appended to the original graphs without requiring retraining of pre-trained graph models across various applications. This novel paradigm, shifting from the traditional pretraining-and-finetuning regime to pretraining-and-prompting, has shown significant empirical success in simulating graph data operations, with applications ranging from recommendation systems to biological networks and graph transfer. However, despite its potential, the theoretical underpinnings of graph prompting remain underexplored, raising critical questions about its fundamental effectiveness; the lack of rigorous theoretical justification for why and how much it works remains an obstacle to further progress in the area. To fill this gap, this paper introduces a theoretical framework that rigorously analyzes graph prompting from a data operation perspective. Our contributions are threefold: First, we provide a formal guarantee theorem, demonstrating graph prompts' capacity to approximate graph transformation operators, effectively linking upstream and downstream tasks. Second, we derive upper bounds on the error of these data operations by graph prompts for a single graph, and we extend this discussion to batches of graphs, which are common in graph model training. Third, we analyze the distribution of data operation errors, extending our theoretical findings from linear graph models (e.g., GCN) to non-linear graph models (e.g., GAT). Extensive experiments support our theoretical results and confirm the practical implications of these guarantees.
Submitted: 2 October, 2024; originally announced October 2024.
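To ground the "data operation" reading: in its simplest form, a graph prompt is a small set of learnable token vectors that modify node features before a frozen pre-trained model runs, thereby emulating a transformation of the input graph. The sketch below is one common formulation under that reading (an assumed module, not the paper's exact construction).

```python
import torch
import torch.nn as nn

class AdditivePrompt(nn.Module):
    """Learnable prompt tokens added to node features of a frozen graph model.

    Each node feature x_i is shifted by an attention-weighted mix of k
    prompt tokens; only the tokens are trained for the downstream task.
    """
    def __init__(self, feat_dim: int, num_tokens: int = 4):
        super().__init__()
        self.tokens = nn.Parameter(torch.randn(num_tokens, feat_dim) * 0.01)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (N, feat_dim)
        attn = torch.softmax(x @ self.tokens.T, dim=-1)  # (N, k) relevance
        return x + attn @ self.tokens                    # prompted features
```

The paper's guarantee theorem, as summarized above, concerns how well such learnable shifts can approximate graph transformation operators.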
arXiv:2410.01604 [pdf, other] (https://arxiv.org/abs/2410.01604)
Subjects: cs.HC (Human-Computer Interaction)
Title: Customizing Generated Signs and Voices of AI Avatars: Deaf-Centric Mixed-Reality Design for Deaf-Hearing Communication
Authors: Si Chen, Haocong Cheng, Suzy Su, Stephanie Patterson, Raja Kushalnagar, Qi Wang, Yun Huang
Abstract: This study investigates innovative interaction designs for communication and collaborative learning between learners of mixed hearing and signing abilities, leveraging advancements in mixed reality technologies like Apple Vision Pro and generative AI for animated avatars. Adopting a participatory design approach, we engaged 15 d/Deaf and hard of hearing (DHH) students to brainstorm ideas for an AI avatar with interpreting ability (sign language to English, voice to English) that would facilitate their face-to-face communication with hearing peers. Participants envisioned the AI avatars addressing some issues with human interpreters, such as lack of availability, and providing an affordable alternative to expensive personalized interpreting services. Our findings indicate a range of preferences for integrating the AI avatars with actual human figures of both DHH and hearing communication partners. The participants highlighted the importance of having control over customizing the AI avatar, such as AI-generated signs, voices, facial expressions, and their synchronization for enhanced emotional display in communication. Based on our findings, we propose a suite of design recommendations that balance respecting sign language norms with adherence to hearing social norms.
Our study offers insights into improving the authenticity of generative AI in scenarios involving specific, and sometimes unfamiliar, social norms.
Submitted: 2 October, 2024; originally announced October 2024.

arXiv:2410.00199 [pdf, other] (https://arxiv.org/abs/2410.00199)
Subjects: cs.HC (Human-Computer Interaction)
Title: Inclusive Emotion Technologies: Addressing the Needs of d/Deaf and Hard of Hearing Learners in Video-Based Learning
Authors: Si Chen, Jason Situ, Haocong Cheng, Suzy Su, Desiree Kirst, Lu Ming, Qi Wang, Lawrence Angrave, Yun Huang
Abstract: Accessibility efforts for d/Deaf and hard of hearing (DHH) learners in video-based learning have mainly focused on captions and interpreters, with limited attention to learners' emotional awareness, an important yet challenging skill for effective learning. Current emotion technologies are designed to support learners' emotional awareness and social needs; however, little is known about whether and how DHH learners could benefit from these technologies.
Our study explores how DHH learners perceive and use emotion data from two collection approaches, self-reported and automatic emotion recognition (AER), in video-based learning. By comparing the use of these technologies between DHH (N=20) and hearing learners (N=20), we identified key differences in their usage and perceptions: (1) DHH learners enhanced their emotional awareness by rewatching the video to self-report their emotions, and they called for alternative methods of self-reporting emotion, such as using sign language or expressive emoji designs; and (2) while the AER technology could be useful for detecting emotional patterns in learning experiences, DHH learners expressed more concerns about the accuracy and intrusiveness of the AER data. Our findings provide novel design implications for improving the inclusiveness of emotion technologies to support DHH learners, such as leveraging DHH peer learners' emotions to elicit reflections.
Submitted: 30 September, 2024; originally announced October 2024.

arXiv:2410.00196 [pdf, other] (https://arxiv.org/abs/2410.00196)
Subjects: cs.HC (Human-Computer Interaction)
Title: Motion Design Principles for Accessible Video-based Learning: Addressing Cognitive Challenges for Deaf and Hard of Hearing Learners
Authors: Si Cheng, Haocong Cheng, Suzy Su, Lu Ming, Sarah Masud, Qi Wang, Yun Huang
Abstract: Deaf and Hard-of-Hearing (DHH) learners face unique challenges in video-based learning due to the complex interplay between visual and auditory information in videos. Traditional approaches to making video content accessible primarily focus on captioning, but these solutions often neglect the cognitive demands of processing both visual and textual information simultaneously.
This paper introduces a set of Motion design guidelines aimed at mitigating these cognitive challenges and improving video learning experiences for DHH learners. Through a two-phase study, we identified five key challenges, including misaligned content and visual overload, and we proposed five design principles accordingly. A user study with 16 DHH participants showed that improving visual-audio relevance and guiding visual attention significantly enhances the learning experience by reducing physical demand, alleviating temporal pressure, and improving learning satisfaction. Our findings highlight the potential of Motion design to transform educational content for DHH learners, and we discuss implications for inclusive video learning tools.
Submitted: 30 September, 2024; originally announced October 2024.

arXiv:2409.15810 [pdf, other] (https://arxiv.org/abs/2409.15810)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Hyperbolic Image-and-Pointcloud Contrastive Learning for 3D Classification
Authors: Naiwen Hu, Haozhe Cheng, Yifan Xie, Pengcheng Shi, Jihua Zhu
Abstract: 3D contrastive representation learning has exhibited remarkable efficacy across various downstream tasks.
However, existing contrastive learning paradigms based on cosine similarity fail to deeply explore the potential intra-modal hierarchical and cross-modal semantic correlations about multi-modal data in Euclidean space. In response, we seek solutions in hyperbolic space and propose a hyperbolic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15810v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15810v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15810v1-abstract-full" style="display: none;"> 3D contrastive representation learning has exhibited remarkable efficacy across various downstream tasks. However, existing contrastive learning paradigms based on cosine similarity fail to deeply explore the potential intra-modal hierarchical and cross-modal semantic correlations about multi-modal data in Euclidean space. In response, we seek solutions in hyperbolic space and propose a hyperbolic image-and-pointcloud contrastive learning method (HyperIPC). For the intra-modal branch, we rely on the intrinsic geometric structure to explore the hyperbolic embedding representation of point cloud to capture invariant features. For the cross-modal branch, we leverage images to guide the point cloud in establishing strong semantic hierarchical correlations. Empirical experiments underscore the outstanding classification performance of HyperIPC. Notably, HyperIPC enhances object classification results by 2.8% and few-shot classification outcomes by 5.9% on ScanObjectNN compared to the baseline. Furthermore, ablation studies and confirmatory testing validate the rationality of HyperIPC&#39;s parameter settings and the effectiveness of its submodules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15810v1-abstract-full').style.display = 'none'; document.getElementById('2409.15810v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
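As a point of reference for the hyperbolic-versus-cosine contrast above, here is a minimal PyTorch sketch of an InfoNCE-style loss whose logits are negative Poincaré-ball geodesic distances rather than cosine similarities. It is not the authors' HyperIPC code; the curvature, feature sizes, and the origin-based exponential map are all assumptions.

```python
import torch
import torch.nn.functional as F

def exp_map_0(v, c=1.0, eps=1e-6):
    # Exponential map at the origin: projects Euclidean features onto the
    # Poincare ball of curvature -c.
    n = v.norm(dim=-1, keepdim=True).clamp_min(eps)
    return torch.tanh(c ** 0.5 * n) * v / (c ** 0.5 * n)

def poincare_dist(x, y, c=1.0, eps=1e-6):
    # Pairwise geodesic distance on the Poincare ball.
    diff2 = (x.unsqueeze(1) - y.unsqueeze(0)).pow(2).sum(-1)
    denom = (1 - c * x.pow(2).sum(-1, keepdim=True)) * \
            (1 - c * y.pow(2).sum(-1).unsqueeze(0))
    arg = 1 + 2 * c * diff2 / denom.clamp_min(eps)
    return torch.acosh(arg.clamp_min(1.0 + eps)) / c ** 0.5

def hyperbolic_infonce(img_feat, pc_feat, tau=0.1):
    # InfoNCE with logits = negative hyperbolic distance, so matched
    # image/point-cloud pairs are pulled together along geodesics.
    zi, zp = exp_map_0(img_feat), exp_map_0(pc_feat)
    logits = -poincare_dist(zi, zp) / tau
    return F.cross_entropy(logits, torch.arange(zi.size(0)))

loss = hyperbolic_infonce(torch.randn(8, 128), torch.randn(8, 128))
```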
arXiv:2409.15803 [pdf, other] cs.CV
3D-JEPA: A Joint Embedding Predictive Architecture for 3D Self-Supervised Representation Learning
Authors: Naiwen Hu, Haozhe Cheng, Yifan Xie, Shiqi Li, Jihua Zhu
Abstract: Invariance-based and generative methods have shown conspicuous performance for 3D self-supervised representation learning (SSRL). However, the former relies on hand-crafted data augmentations that introduce bias not universally applicable to all downstream tasks, while the latter indiscriminately reconstructs masked regions, resulting in irrelevant details being saved in the representation space. To solve this problem, we introduce 3D-JEPA, a novel non-generative 3D SSRL framework. Specifically, we propose a multi-block sampling strategy that produces a sufficiently informative context block and several representative target blocks. We present a context-aware decoder to enhance the reconstruction of the target blocks: the context information is fed to the decoder continuously, so the encoder learns semantic modeling rather than memorizing the context information related to the target blocks. Overall, 3D-JEPA predicts the representation of target blocks from a context block using the encoder and context-aware decoder architecture. Various downstream tasks on different datasets demonstrate 3D-JEPA's effectiveness and efficiency, achieving higher accuracy with fewer pretraining epochs, e.g., 88.65% accuracy on PB_T50_RS with 150 pretraining epochs.
Submitted 24 September, 2024; originally announced September 2024.
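The predict-representations-not-points idea can be made concrete in a few lines. The schematic below is a generic JEPA-style sketch under toy assumptions (MLP encoders, a single context and target block, mean pooling); the paper's multi-block sampling and context-aware decoder are reduced to stand-ins.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

enc = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 64))      # online encoder
tgt_enc = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 64))  # frozen/EMA target encoder
pred = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64))    # predictor stand-in for the decoder

points = torch.randn(1024, 3)                 # toy point cloud
idx = torch.randperm(1024)
ctx, tgt = idx[:256], idx[256:512]            # one context block, one target block

ctx_rep = enc(points[ctx]).mean(0)            # summarize the visible context
with torch.no_grad():
    tgt_rep = tgt_enc(points[tgt]).mean(0)    # latent target: no points reconstructed

loss = F.mse_loss(pred(ctx_rep), tgt_rep)     # predict representations, not geometry
loss.backward()
```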
arXiv:2409.13174 [pdf, other] cs.CV
Manipulation Facing Threats: Evaluating Physical Vulnerabilities in End-to-End Vision Language Action Models
Authors: Hao Cheng, Erjia Xiao, Chengyuan Yu, Zhao Yao, Jiahang Cao, Qiang Zhang, Jiaxu Wang, Mengshu Sun, Kaidi Xu, Jindong Gu, Renjing Xu
Abstract: Recently, driven by advancements in Multimodal Large Language Models (MLLMs), Vision Language Action Models (VLAMs) have been proposed to achieve better performance in open-vocabulary scenarios for robotic manipulation tasks. Since manipulation tasks involve direct interaction with the physical world, ensuring robustness and safety during task execution is a critical issue. In this paper, by synthesizing current safety research on MLLMs and the specific application scenarios of manipulation tasks in the physical world, we comprehensively evaluate VLAMs in the face of potential physical threats. Specifically, we propose the Physical Vulnerability Evaluating Pipeline (PVEP), which incorporates as many visual-modal physical threats as possible for evaluating the physical robustness of VLAMs; the threats include Out-of-Distribution inputs, Typography-based Visual Prompts, and Adversarial Patch Attacks. By comparing the performance fluctuations of VLAMs before and after being attacked, we provide generalizable analyses of how VLAMs respond to different physical security threats. Project page: https://chaducheng.github.io/Manipulat-Facing-Threats/.
Submitted 4 November, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
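The evaluation pattern the abstract describes, comparing task performance before and after each attack, fits a small harness. Everything below (the toy policy, observation format, and attack) is an invented placeholder, not PVEP itself:

```python
import random

def success_rate(policy, observations):
    # Fraction of episodes the policy completes successfully (1/0 outcomes).
    return sum(policy(o) for o in observations) / len(observations)

def evaluate_physical_threats(policy, observations, attacks):
    # Report performance fluctuation: clean success rate vs. the rate under
    # each visual-modal attack applied to the same observations.
    report = {"clean": success_rate(policy, observations)}
    for name, attack in attacks.items():
        report[name] = success_rate(policy, [attack(o) for o in observations])
    return report

policy = lambda obs: 0 if obs.get("patched") else 1          # toy stand-in "VLAM"
clean_obs = [{"patched": False} for _ in range(20)]
attacks = {"adv_patch": lambda o: {**o, "patched": random.random() < 0.7}}
print(evaluate_physical_threats(policy, clean_obs, attacks))
```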
arXiv:2409.12136 [pdf, other] cs.CL cs.AI cs.LG
GRIN: GRadient-INformed MoE
Authors: Liyuan Liu, Young Jin Kim, Shuohang Wang, Chen Liang, Yelong Shen, Hao Cheng, Xiaodong Liu, Masahiro Tanaka, Xiaoxia Wu, Wenxiang Hu, Vishrav Chaudhary, Zeqi Lin, Chenruidong Zhang, Jilong Xue, Hany Awadalla, Jianfeng Gao, Weizhu Chen
Abstract: Mixture-of-Experts (MoE) models scale more effectively than dense models due to sparse computation through expert routing, selectively activating only a small subset of expert modules. However, sparse computation challenges traditional training practices, as discrete expert routing hinders standard backpropagation and thus gradient-based optimization, which are the cornerstone of deep learning. To better pursue the scaling power of MoE, we introduce GRIN (GRadient-INformed MoE training), which incorporates sparse gradient estimation for expert routing and configures model parallelism to avoid token dropping. Applying GRIN to autoregressive language modeling, we develop a top-2 16$\times$3.8B MoE model. Our model, with only 6.6B activated parameters, outperforms a 7B dense model and matches the performance of a 14B dense model trained on the same data. Extensive evaluations across diverse tasks demonstrate the potential of GRIN to significantly enhance MoE efficacy, achieving 79.4 on MMLU, 83.7 on HellaSwag, 74.4 on HumanEval, and 58.9 on MATH.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 58 pages.
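For orientation, a generic top-2 MoE layer is sketched below, in which the router receives gradients only through the normalized gate probabilities. GRIN's sparse gradient estimator for the discrete routing decision is more sophisticated than this baseline, so read the sketch as context rather than the method:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Top2MoE(nn.Module):
    def __init__(self, d=32, n_experts=8):
        super().__init__()
        self.router = nn.Linear(d, n_experts)
        self.experts = nn.ModuleList(nn.Linear(d, d) for _ in range(n_experts))

    def forward(self, x):                        # x: (tokens, d)
        probs = F.softmax(self.router(x), dim=-1)
        top_p, top_i = probs.topk(2, dim=-1)     # discrete top-2 routing
        top_p = top_p / top_p.sum(-1, keepdim=True)
        out = torch.zeros_like(x)
        for slot in range(2):
            for e, expert in enumerate(self.experts):
                sel = top_i[:, slot] == e        # tokens sent to expert e
                if sel.any():
                    # gate-prob scaling is the only path for router gradients
                    out[sel] = out[sel] + top_p[sel, slot:slot + 1] * expert(x[sel])
        return out

y = Top2MoE()(torch.randn(16, 32))
y.sum().backward()                               # the router still receives gradient
```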
arXiv:2409.10790 [pdf, other] cs.CL cs.AI
Model Tells Itself Where to Attend: Faithfulness Meets Automatic Attention Steering
Authors: Qingru Zhang, Xiaodong Yu, Chandan Singh, Xiaodong Liu, Liyuan Liu, Jianfeng Gao, Tuo Zhao, Dan Roth, Hao Cheng
Abstract: Large language models (LLMs) have demonstrated remarkable performance across various real-world tasks. However, they often struggle to fully comprehend and effectively utilize their input contexts, resulting in responses that are unfaithful or hallucinated. This difficulty increases for contexts that are long or contain distracting information, which can divert LLMs from fully capturing essential evidence. To address this issue, many works use prompting to help LLMs utilize contextual information more faithfully. For instance, iterative prompting highlights key information in two steps: first asking the LLM to identify the important pieces of context, then deriving answers accordingly. However, prompting methods are constrained to highlighting key information implicitly in token space, which is often insufficient to fully steer the model's attention. To improve model faithfulness more reliably, we propose AutoPASTA, a method that automatically identifies key contextual information and explicitly highlights it by steering an LLM's attention scores. Like prompting, AutoPASTA is applied at inference time and does not require changing any model parameters. Our experiments on open-book QA demonstrate that AutoPASTA effectively enables models to grasp essential contextual information, leading to substantially improved model faithfulness and performance, e.g., an average improvement of 7.95% for LLAMA3-70B-Instruct. Code will be publicly available at https://github.com/QingruZhang/AutoPASTA.
Submitted 16 September, 2024; originally announced September 2024.
Comments: 12 pages, 4 figures.
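Mechanically, attention steering of this family can be pictured as reweighting attention after the softmax: shrink the mass on non-highlighted tokens and renormalize. The single-function sketch below uses invented shapes and a uniform scaling factor; the released method selects specific heads and layers:

```python
import torch
import torch.nn.functional as F

def steered_attention(q, k, v, key_span, alpha=0.01):
    # q, k, v: (heads, seq, d); key_span: indices of tokens to emphasize.
    scores = q @ k.transpose(-2, -1) / k.size(-1) ** 0.5
    probs = F.softmax(scores, dim=-1)
    emphasis = torch.zeros(k.size(1))
    emphasis[key_span] = 1.0
    probs = probs * (alpha + (1 - alpha) * emphasis)   # shrink non-highlighted keys
    probs = probs / probs.sum(-1, keepdim=True)        # renormalize each row
    return probs @ v

h, s, d = 4, 12, 16
out = steered_attention(torch.randn(h, s, d), torch.randn(h, s, d),
                        torch.randn(h, s, d), key_span=slice(3, 6))
```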
arXiv:2409.07832 [pdf, other] cs.AR cs.LG
Efficient and Reliable Vector Similarity Search Using Asymmetric Encoding with NAND-Flash for Many-Class Few-Shot Learning
Authors: Hao-Wei Chiang, Chi-Tse Huang, Hsiang-Yun Cheng, Po-Hao Tseng, Ming-Hsiu Lee, An-Yeu Wu
Abstract: While memory-augmented neural networks (MANNs) offer an effective solution for few-shot learning (FSL) by integrating deep neural networks with external memory, the capacity requirements and energy overhead of data movement become enormous due to the large number of support vectors in many-class FSL scenarios. Various in-memory search solutions have emerged to improve the energy efficiency of MANNs. NAND-based multi-bit content addressable memory (MCAM) is a promising option due to its high density and large capacity, but it faces limitations such as a restricted number of word lines, limited quantization levels, and non-ideal effects like varying string currents and bottleneck effects, which lead to significant accuracy drops. To address these issues, we propose three methods. First, the Multi-bit Thermometer Code (MTMC) leverages the extensive capacity of MCAM to enhance vector precision using cumulative encoding rules, thereby mitigating the bottleneck effect. Second, Asymmetric Vector Similarity Search (AVSS) reduces the precision of the query vector while maintaining that of the support vectors, minimizing search iterations and improving efficiency in many-class scenarios. Finally, Hardware-Aware Training (HAT) optimizes controller training by modeling the hardware characteristics of MCAM, thus enhancing the reliability of the system. Our integrated framework reduces search iterations by up to 32 times and increases overall accuracy by 1.58% to 6.94%.
Submitted 12 September, 2024; originally announced September 2024.
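The cumulative encoding rule behind a thermometer code is simple to state: a quantized level q becomes q ones followed by zeros, so the Hamming distance between two codes equals the difference of their levels. A plain NumPy sketch follows (quantization range and level count are assumptions; MCAM cell behavior is omitted):

```python
import numpy as np

def thermometer_encode(vec, levels=7, lo=-1.0, hi=1.0):
    # Quantize each component to {0..levels}, then emit q ones followed by zeros.
    q = np.clip(np.round((vec - lo) / (hi - lo) * levels), 0, levels).astype(int)
    return (np.arange(levels) < q[:, None]).astype(np.uint8)   # shape (dim, levels)

def mismatch(a, b):
    # Hamming distance between thermometer codes equals |q_a - q_b| per
    # dimension, so similarity degrades smoothly with quantization difference.
    return int(np.abs(a.astype(int) - b.astype(int)).sum())

x = thermometer_encode(np.array([0.3, -0.8]))
y = thermometer_encode(np.array([0.1, -0.7]))
print(mismatch(x, y))
```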
arXiv:2409.07163 [pdf, other] cs.RO cs.CV
Mamba Policy: Towards Efficient 3D Diffusion Policy with Hybrid Selective State Models
Authors: Jiahang Cao, Qiang Zhang, Jingkai Sun, Jiaxu Wang, Hao Cheng, Yulin Li, Jun Ma, Yecheng Shao, Wen Zhao, Gang Han, Yijie Guo, Renjing Xu
Abstract: Diffusion models have been widely employed in the field of 3D manipulation due to their efficient capability to learn distributions, allowing for precise prediction of action trajectories. However, diffusion models typically rely on large-parameter UNet backbones as policy networks, which can be challenging to deploy on resource-constrained devices. Recently, the Mamba model has emerged as a promising solution for efficient modeling, offering low computational complexity and strong performance in sequence modeling. In this work, we propose the Mamba Policy, a lighter but stronger policy that reduces the parameter count by over 80% compared to the original policy network while achieving superior performance. Specifically, we introduce the XMamba Block, which effectively integrates input information with conditional features and leverages a combination of Mamba and Attention mechanisms for deep feature extraction. Extensive experiments demonstrate that the Mamba Policy excels on the Adroit, DexArt, and MetaWorld datasets, requiring significantly fewer computational resources. Additionally, we highlight the Mamba Policy's enhanced robustness in long-horizon scenarios compared to baseline methods, and explore the performance of various Mamba variants within the Mamba Policy framework. Project page: https://andycao1125.github.io/mamba_policy/.
Submitted 11 September, 2024; originally announced September 2024.
Comments: 7 pages, 5 figures.
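To make the fusion sentence concrete, here is a schematic block: FiLM-style conditioning, a sequence-model pass, then self-attention with a residual sum. A GRU stands in where the Mamba SSM would sit so the sketch runs without the mamba-ssm package; all dimensions and layer choices are assumptions, not the XMamba Block itself.

```python
import torch
import torch.nn as nn

class FusionBlock(nn.Module):
    def __init__(self, d=64):
        super().__init__()
        self.film = nn.Linear(d, 2 * d)                 # condition -> (scale, shift)
        self.seq = nn.GRU(d, d, batch_first=True)       # stand-in for a Mamba SSM
        self.attn = nn.MultiheadAttention(d, num_heads=4, batch_first=True)

    def forward(self, x, cond):                         # x: (B, T, d), cond: (B, d)
        scale, shift = self.film(cond).chunk(2, dim=-1)
        x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)   # inject the condition
        x, _ = self.seq(x)                               # sequence-modeling pass
        a, _ = self.attn(x, x, x)                        # attention branch
        return x + a                                     # residual combination

out = FusionBlock()(torch.randn(2, 16, 64), torch.randn(2, 64))
```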
arXiv:2409.06741 [pdf, other] cs.SE cs.AI
Generative AI for Requirements Engineering: A Systematic Literature Review
Authors: Haowei Cheng, Jati H. Husen, Sien Reeve Peralta, Bowen Jiang, Nobukazu Yoshioka, Naoyasu Ubayashi, Hironori Washizaki
Abstract: Context: Generative AI (GenAI) has emerged as a transformative tool in software engineering, with requirements engineering (RE) actively exploring its potential to revolutionize processes and outcomes. The integration of GenAI into RE presents both promising opportunities and significant challenges that necessitate systematic analysis and evaluation. Objective: This paper presents a comprehensive systematic literature review (SLR) analyzing state-of-the-art applications and innovative proposals leveraging GenAI in RE. It surveys studies focusing on the utilization of GenAI to enhance RE processes while identifying key challenges and opportunities in this rapidly evolving field. Method: A rigorous SLR methodology was used to analyze 27 carefully selected primary studies in depth. The review examined research questions pertaining to the application of GenAI across various RE phases, the models and techniques used, and the challenges encountered in implementation and adoption. Results: The most salient findings include i) a predominant focus on the early stages of RE, particularly the elicitation and analysis of requirements, indicating potential for expansion into later phases; ii) the dominance of large language models, especially the GPT series, highlighting the need for diverse AI approaches; and iii) persistent challenges in domain-specific applications and the interpretability of AI-generated outputs, underscoring areas requiring further research and development. Conclusions: The results highlight the critical need for comprehensive evaluation frameworks, improved human-AI collaboration models, and thorough consideration of ethical implications in GenAI-assisted RE. Future research should prioritize extending GenAI applications across the entire RE lifecycle, enhancing domain-specific capabilities, and developing strategies for responsible AI integration in RE practices.
Submitted 9 September, 2024; originally announced September 2024.
arXiv:2409.02640 [pdf, ps, other] math.OC cs.IT
Linear Convergence in Hilbert's Projective Metric for Computing Augustin Information and a Rényi Information Measure
Authors: Chung-En Tsai, Guan-Ren Wang, Hao-Chung Cheng, Yen-Huan Li
Abstract: Consider the problems of computing the Augustin information and a Rényi information measure of statistical independence, previously explored by Lapidoth and Pfister (IEEE Information Theory Workshop, 2018) and Tomamichel and Hayashi (IEEE Trans. Inf. Theory, 64(2):1064-1082, 2018). Both quantities are defined as solutions to optimization problems and lack closed-form expressions. This paper analyzes two iterative algorithms: Augustin's fixed-point iteration for computing the Augustin information, and the algorithm by Kamatsuka et al. (arXiv:2404.10950) for the Rényi information measure. Previously, it was only known that these algorithms converge asymptotically. We establish the linear convergence of Augustin's algorithm for the Augustin information of order $\alpha \in (1/2, 1) \cup (1, 3/2)$ and of Kamatsuka et al.'s algorithm for the Rényi information measure of order $\alpha \in [1/2, 1) \cup (1, \infty)$, using Hilbert's projective metric.
Submitted 27 September, 2024; v1 submitted 4 September, 2024; originally announced September 2024.
Comments: 15 pages; last sentence of the first paragraph and Eq. (2) corrected.
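For orientation, the metric in the title takes, on the positive orthant, the following textbook form (the paper's setting may be stated more generally), and linear convergence follows from showing the iteration map is a contraction in it:

```latex
d_H(x, y) \;=\; \log \max_{i,j} \frac{x_i\, y_j}{x_j\, y_i},
\qquad x, y \in \mathbb{R}^n_{>0};
\quad
d_H\big(T(x), T(y)\big) \le \kappa\, d_H(x, y),\ \kappa < 1
\;\Longrightarrow\;
d_H(x_k, x^\star) \le \kappa^k\, d_H(x_0, x^\star).
```

Here $T$ is the fixed-point iteration, $x_k = T(x_{k-1})$, and $x^\star$ is its fixed point.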
arXiv:2408.16467 [pdf, other] cs.NE cs.CV
Spiking Diffusion Models
Authors: Jiahang Cao, Hanzhong Guo, Ziqing Wang, Deming Zhou, Hao Cheng, Qiang Zhang, Renjing Xu
Abstract: Recent years have witnessed Spiking Neural Networks (SNNs) gaining attention for their ultra-low energy consumption and high biological plausibility compared with traditional Artificial Neural Networks (ANNs). Despite their distinguished properties, the application of SNNs in the computationally intensive field of image generation is still under exploration. In this paper, we propose the Spiking Diffusion Models (SDMs), an innovative family of SNN-based generative models that excel at producing high-quality samples with significantly reduced energy consumption. In particular, we propose a Temporal-wise Spiking Mechanism (TSM) that allows SNNs to capture more temporal features from a bio-plasticity perspective, and a threshold-guided strategy that can further improve performance by up to 16.7% without any additional training. We also make the first attempt to use the ANN-SNN approach for SNN-based generation tasks. Extensive experimental results reveal that our approach not only exhibits comparable performance to its ANN counterpart with few spiking time steps, but also outperforms previous SNN-based generative models by a large margin. Moreover, we demonstrate the high-quality generation ability of SDM on large-scale datasets, e.g., LSUN bedroom. This development marks a pivotal advancement in the capabilities of SNN-based generation, paving the way for future research avenues to realize low-energy and low-latency generative applications. Our code is available at https://github.com/AndyCao1125/SDM.
Submitted 29 August, 2024; originally announced August 2024.
Comments: Accepted by IEEE Transactions on Artificial Intelligence.
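As background for readers new to SNNs, the discrete-time leaky integrate-and-fire dynamics that such generators are built from fit in a few lines; this is the generic neuron model, not the paper's TSM or threshold-guided strategy (time constant and threshold are arbitrary):

```python
import torch

def lif(currents, tau=2.0, v_th=1.0):
    # currents: (T, batch, d) input currents -> binary spike trains, same shape.
    v = torch.zeros_like(currents[0])
    spikes = []
    for t in range(currents.shape[0]):
        v = v + (currents[t] - v) / tau      # leaky membrane integration
        s = (v >= v_th).float()              # spike on threshold crossing
        v = v * (1 - s)                      # hard reset where a spike fired
        spikes.append(s)
    return torch.stack(spikes)

out = lif(torch.rand(8, 2, 16) * 2.0)        # 8 time steps of toy input
```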
arXiv:2408.14074 [pdf, other] cs.SE
Abstraction Engineering
Authors: Nelly Bencomo, Jordi Cabot, Marsha Chechik, Betty H. C. Cheng, Benoit Combemale, Andrzej Wąsowski, Steffen Zschaler
Abstract: Modern software-based systems operate under rapidly changing conditions and face ever-increasing uncertainty. In response, systems are increasingly adaptive and reliant on artificial-intelligence methods. In addition to the ubiquity of software with respect to users and application areas (e.g., transportation, smart grids, medicine, etc.), these high-impact software systems necessarily draw from many disciplines for foundational principles, domain expertise, and workflows. Recent progress with lowering the barrier to entry for coding has led to a broader community of developers, who are not necessarily software engineers. As such, the field of software engineering needs to adapt accordingly and offer new methods to systematically develop high-quality software systems by a broad range of experts and non-experts. This paper looks at these new challenges and proposes to address them through the lens of abstraction. Abstraction is already used across many disciplines involved in software development, from the time-honored classical deductive reasoning and formal modeling to the inductive reasoning employed by modern data science. The software engineering of the future requires Abstraction Engineering: a systematic approach to abstraction across the inductive and deductive spaces. We discuss the foundations of Abstraction Engineering, identify key challenges, highlight the research questions that help address these challenges, and create a roadmap for future research.
Submitted 26 August, 2024; originally announced August 2024.
arXiv:2408.12355 [pdf, other] cs.CV cs.AI
Class-balanced Open-set Semi-supervised Object Detection for Medical Images
Authors: Zhanyun Lu, Renshu Gu, Huimin Cheng, Siyu Pang, Mingyu Xu, Peifang Xu, Yaqi Wang, Yuichiro Kinoshita, Juan Ye, Gangyong Jia, Qing Wu
Abstract: Medical image datasets in the real world are often unlabeled and imbalanced, and Semi-Supervised Object Detection (SSOD) can utilize unlabeled data to improve an object detector. However, existing approaches predominantly assume that the unlabeled data and test data contain no out-of-distribution (OOD) classes. The few open-set semi-supervised object detection methods have two weaknesses: first, class imbalance is not considered; second, OOD instances are merely distinguished and then discarded during pseudo-labeling. In this paper, we consider the open-set semi-supervised object detection problem, which leverages unlabeled data containing OOD classes to improve object detection for medical images. Our study incorporates two key innovations: Category Control Embed (CCE) and an out-of-distribution Detection Fusion Classifier (OODFC). CCE is designed to tackle dataset imbalance by constructing a Foreground Information Library, while OODFC tackles open-set challenges by integrating "unknown" information into basic pseudo-labels. Our method outperforms the state-of-the-art SSOD baseline, achieving a 4.25 mAP improvement on the public Parasite dataset.
Submitted 22 August, 2024; originally announced August 2024.
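The pseudo-labeling idea of keeping OOD instances as an explicit "unknown" class, rather than discarding them, can be sketched as a small decision rule; the thresholds and the rule itself are invented for illustration, not OODFC:

```python
def fuse_pseudo_label(cls_probs, ood_score, known_thr=0.8, ood_thr=0.6):
    """Return a pseudo-label: a known class index, 'unknown', or None (ignore)."""
    best = max(range(len(cls_probs)), key=lambda i: cls_probs[i])
    if ood_score > ood_thr:
        return "unknown"                  # keep OOD instances as their own class
    if cls_probs[best] > known_thr:
        return best                       # confident in-distribution label
    return None                           # too uncertain: exclude from training

print(fuse_pseudo_label([0.05, 0.90, 0.05], ood_score=0.20))  # -> 1
print(fuse_pseudo_label([0.40, 0.30, 0.30], ood_score=0.75))  # -> 'unknown'
```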
arXiv:2408.08147 [pdf, other] cs.DC cs.CL cs.LG
P/D-Serve: Serving Disaggregated Large Language Model at Scale
Authors: Yibo Jin, Tao Wang, Huimin Lin, Mingyang Song, Peiyang Li, Yipeng Ma, Yicheng Shan, Zhengfan Yuan, Cailong Li, Yajing Sun, Tiandeng Wu, Xing Chu, Ruizhi Huan, Li Ma, Xiao You, Wenting Zhou, Yunpeng Ye, Wen Liu, Xiangkun Xu, Yongsheng Zhang, Tiantian Dong, Jiawei Zhu, Zhe Wang, Xijian Ju, Jianxun Song, et al. (5 additional authors not shown)
Abstract: Serving disaggregated large language models (LLMs) over tens of thousands of xPU devices (GPUs or NPUs) with reliable performance faces multiple challenges. 1) Ignoring workload diversity (various prefixes and tidal requests) and treating all prompts in one mixed pool is inadequate; to exploit the similarity within each scenario and minimize the mismatch in P/D (prefill and decoding) processing, fine-grained organization is required, dynamically adjusting P/D ratios for better performance. 2) Due to inaccurate estimation of workload (queue status or maintained connections), the global scheduler easily incurs unnecessary timeouts in prefill. 3) Block-fixed device-to-device (D2D) KVCache transfer over cluster-level RDMA (remote direct memory access) fails to achieve the desired D2D utilization. To overcome these problems, this paper proposes P/D-Serve, an end-to-end system complying with the paradigm of MLOps (machine learning operations), which models end-to-end (E2E) P/D performance and enables: 1) fine-grained P/D organization, mapping the service with RoCE (RDMA over converged ethernet) as needed, to facilitate similar processing and dynamic adjustment of P/D ratios; 2) on-demand forwarding upon rejections for idle prefill, decoupling the scheduler from regular inaccurate reports and local queues, to avoid timeouts in prefill; and 3) efficient KVCache transfer via optimized D2D access. P/D-Serve is implemented upon Ascend and MindSpore, has been deployed over tens of thousands of NPUs for more than eight months in commercial use, and achieves 60%, 42%, and 46% improvements on E2E throughput, time-to-first-token (TTFT) SLO (service level objective), and D2D transfer time, respectively. As the E2E system with optimizations, P/D-Serve achieves a 6.7x increase in throughput compared with aggregated LLMs.
Submitted 15 August, 2024; originally announced August 2024.
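As a toy illustration of dynamic P/D-ratio adjustment, the helper below splits a fixed pool of instances proportionally to measured prefill and decode load. The real system involves far more (scenario grouping, RoCE mapping, KVCache transfer, SLOs); the function and its inputs are invented:

```python
def split_instances(total, prefill_load, decode_load):
    """Allocate instances proportionally to per-stage load, at least 1 each."""
    p = max(1, round(total * prefill_load / (prefill_load + decode_load)))
    return p, max(1, total - p)  # (prefill instances, decode instances)

# With decode demand heavier than prefill, the split tilts toward decode.
print(split_instances(16, prefill_load=3.0, decode_load=5.0))  # -> (6, 10)
```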
arXiv:2408.07611 [pdf, other] cs.CL cs.IR
WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation Integrating Web Search and Knowledge Graphs
Authors: Weijian Xie, Xuefeng Liang, Yuhui Liu, Kaihua Ni, Hong Cheng, Zetian Hu
Abstract: Large Language Models (LLMs) have greatly contributed to the development of adaptive intelligent agents and are positioned as an important way to achieve Artificial General Intelligence (AGI). However, LLMs are prone to producing factually incorrect information and often generate "phantom" content that undermines their reliability, which poses a serious challenge for their deployment in real-world scenarios. Enhancing LLMs by combining external databases and information retrieval mechanisms is an effective path. To address the above challenges, we propose WeKnow-RAG, a new approach that integrates Web search and Knowledge Graphs into a Retrieval-Augmented Generation (RAG) system. First, the accuracy and reliability of LLM responses are improved by combining the structured representation of Knowledge Graphs with the flexibility of dense vector retrieval. WeKnow-RAG then utilizes domain-specific knowledge graphs to satisfy a variety of queries and domains, improving performance on factual information and complex reasoning tasks through multi-stage web page retrieval that employs both sparse and dense methods. Our approach effectively balances the efficiency and accuracy of information retrieval, thus improving the overall retrieval process. Finally, we integrate a self-assessment mechanism for the LLM to evaluate the trustworthiness of the answers it generates. Our approach proved highly effective in a wide range of offline experiments and online submissions.
Submitted 27 August, 2024; v1 submitted 14 August, 2024; originally announced August 2024.
Comments: 8 pages, 2 figures; technical report for 3rd place in Task 3 of the Meta KDD Cup 2024 CRAG Challenge.
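A toy version of the hybrid retrieval path described above, fusing a sparse term-overlap ranking with a dense vector ranking and gating the answer behind a self-check, can be written in a few lines. Every component here is a stand-in (hashing-trick embeddings, a lexical "self-assessment"), not WeKnow-RAG's implementation:

```python
import numpy as np

docs = ["the eiffel tower is in paris",
        "mount fuji is the highest peak of japan",
        "paris hosts the louvre museum"]

def sparse_scores(query):
    q = set(query.split())                     # BM25-like term overlap, toy form
    return np.array([len(q & set(d.split())) for d in docs], float)

def embed(text, dim=64):                       # hashing trick as a toy dense encoder
    v = np.zeros(dim)
    for tok in text.split():
        v[hash(tok) % dim] += 1.0
    return v / (np.linalg.norm(v) + 1e-9)

def dense_scores(query):
    q = embed(query)
    return np.array([q @ embed(d) for d in docs])

def retrieve(query, k=2, w=0.5):
    fused = w * sparse_scores(query) + (1 - w) * dense_scores(query)
    return [docs[i] for i in np.argsort(-fused)[:k]]

def self_assess(answer, evidence):
    # Stand-in for the LLM trustworthiness check: require lexical support.
    return any(tok in e for e in evidence for tok in answer.split())

evidence = retrieve("where is the eiffel tower")
print(evidence, self_assess("paris", evidence))
```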
Finally, we also integrate a self-assessment mechanism for the LLM to evaluate the trustworthiness of the answers it generates. Our approach proves its outstanding effectiveness in a wide range of offline experiments and online submissions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07611v2-abstract-full').style.display = 'none'; document.getElementById('2408.07611v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta KDD Cup 2024 CRAG Challenge</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06922">arXiv:2408.06922</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06922">pdf</a>, <a href="https://arxiv.org/format/2408.06922">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Temporal Variability and Multi-Viewed Self-Supervised Representations to Tackle the ASVspoof5 Deepfake Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Haonan Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+L">Long Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06922v1-abstract-short" style="display: inline;"> ASVspoof5, the fifth edition of the ASVspoof series, is one of the largest global audio security challenges. It aims to advance the development of countermeasure (CM) to discriminate bonafide and spoofed speech utterances. In this paper, we focus on addressing the problem of open-domain audio deepfake detection, which corresponds directly to the ASVspoof5 Track1 open condition. 
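The combination of sparse and dense retrieval mentioned above can be sketched as a simple weighted score fusion. The term-overlap scorer, random stand-in embeddings, and the mixing weight alpha below are all illustrative assumptions, not WeKnow-RAG's actual pipeline.

```python
# Minimal sketch of hybrid sparse + dense retrieval scoring.
import numpy as np

docs = ["knowledge graphs store structured facts",
        "web search returns fresh unstructured pages",
        "dense retrieval embeds queries and passages"]

def sparse_score(query, doc):
    # crude term-overlap stand-in for a BM25-style sparse retriever
    q, d = set(query.split()), set(doc.split())
    return len(q & d) / (len(q) or 1)

rng = np.random.default_rng(0)
doc_emb = rng.normal(size=(len(docs), 8))   # stand-in dense embeddings

def dense_score(q_emb, d_emb):
    return float(q_emb @ d_emb /
                 (np.linalg.norm(q_emb) * np.linalg.norm(d_emb)))

def hybrid_rank(query, alpha=0.5):
    q_emb = rng.normal(size=8)
    scores = [alpha * sparse_score(query, d) +
              (1 - alpha) * dense_score(q_emb, doc_emb[i])
              for i, d in enumerate(docs)]
    return sorted(zip(scores, docs), reverse=True)

print(hybrid_rank("structured knowledge graphs"))
```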

arXiv:2408.06922 [pdf, other] (cs.SD, cs.AI, eess.AS)
Temporal Variability and Multi-Viewed Self-Supervised Representations to Tackle the ASVspoof5 Deepfake Challenge
Authors: Yuankun Xie, Xiaopeng Wang, Zhiyong Wang, Ruibo Fu, Zhengqi Wen, Haonan Cheng, Long Ye
Abstract: ASVspoof5, the fifth edition of the ASVspoof series, is one of the largest global audio security challenges. It aims to advance the development of countermeasures (CMs) that discriminate bonafide from spoofed speech utterances. In this paper, we focus on the problem of open-domain audio deepfake detection, which corresponds directly to the ASVspoof5 Track 1 open condition. First, we comprehensively investigate various CMs on ASVspoof5, including data expansion, data augmentation, and self-supervised learning (SSL) features. Motivated by the high-frequency gaps characteristic of the ASVspoof5 dataset, we introduce Frequency Mask, a data augmentation method that masks specific frequency bands to improve CM robustness. Combining various scales of temporal information with multiple SSL features, our experiments achieved a minDCF of 0.0158 and an EER of 0.55% on the ASVspoof5 Track 1 evaluation progress set.
Submitted 13 August, 2024; originally announced August 2024.
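A plausible form of the Frequency Mask augmentation, zeroing a random band of frequency bins in a spectrogram; the band width and fill value are assumptions, since the abstract only says that specific frequency bands are masked.

```python
# Sketch of a Frequency Mask style augmentation on a spectrogram.
import numpy as np

def frequency_mask(spec, max_width=12, rng=None):
    """spec: (n_freq_bins, n_frames) magnitude spectrogram."""
    rng = rng or np.random.default_rng()
    n_freq = spec.shape[0]
    width = int(rng.integers(1, max_width + 1))
    start = int(rng.integers(0, n_freq - width + 1))
    out = spec.copy()
    out[start:start + width, :] = 0.0   # suppress one frequency band
    return out

spec = np.abs(np.random.default_rng(1).normal(size=(80, 200)))
aug = frequency_mask(spec)
print((aug == 0).all(axis=1).sum(), "bins masked")
```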

arXiv:2408.06569 [pdf, other] (cs.CL, cs.AI)
Social Debiasing for Fair Multi-modal LLMs
Authors: Harry Cheng, Yangyang Guo, Qingpei Guo, Ming Yang, Tian Gan, Liqiang Nie
Abstract: Multi-modal Large Language Models (MLLMs) have advanced significantly, offering powerful vision-language understanding capabilities. However, these models often inherit severe social biases from their training datasets, leading to unfair predictions based on attributes like race and gender. This paper addresses social biases in MLLMs by i) introducing a comprehensive Counterfactual dataset with Multiple Social Concepts (CMSC), which provides a more diverse and extensive training set than existing datasets, and ii) proposing an Anti-Stereotype Debiasing strategy (ASD). Our method revisits the MLLM training process, rescales the autoregressive loss function, and improves data sampling to counteract biases. Through extensive experiments on various MLLMs, our CMSC dataset and ASD method demonstrate a significant reduction in social biases while maintaining the models' original performance.
Submitted 12 August, 2024; originally announced August 2024.
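The ASD strategy is only outlined above. As a loose illustration, one way to "rescale the autoregressive loss" is to upweight samples from under-represented social-concept groups; the weighting scheme below is an assumption for illustration, not the paper's formulation.

```python
# Hedged sketch of a group-reweighted autoregressive loss (assumed scheme,
# not the paper's exact ASD rescaling).
import torch
import torch.nn.functional as F

def rescaled_autoregressive_loss(logits, targets, group_ids, group_freq):
    # logits: (B, T, V); targets: (B, T); group_ids: (B,) indexes group_freq
    per_tok = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)), targets.reshape(-1),
        reduction="none").reshape(targets.shape)      # (B, T)
    weights = 1.0 / group_freq[group_ids]             # rare groups weigh more
    weights = weights / weights.mean()                # keep the loss scale stable
    return (per_tok.mean(dim=1) * weights).mean()

B, T, V = 4, 6, 50
logits = torch.randn(B, T, V)
targets = torch.randint(0, V, (B, T))
group_ids = torch.tensor([0, 0, 1, 2])                # toy group labels
group_freq = torch.tensor([0.6, 0.3, 0.1])            # toy group frequencies
print(rescaled_autoregressive_loss(logits, targets, group_ids, group_freq))
```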

arXiv:2407.16564 [pdf, other] (cs.SD, cs.AI, eess.AS)
Audio Prompt Adapter: Unleashing Music Editing Abilities for Text-to-Music with Lightweight Finetuning
Authors: Fang-Duo Tsai, Shih-Lun Wu, Haven Kim, Bo-Yu Chen, Hao-Chung Cheng, Yi-Hsuan Yang
Abstract: Text-to-music models allow users to generate nearly realistic musical audio with textual commands. However, editing music audio remains challenging due to the conflicting desiderata of performing fine-grained alterations on the audio while maintaining a simple user interface. To address this challenge, we propose Audio Prompt Adapter (AP-Adapter), a lightweight addition to pretrained text-to-music models. We utilize AudioMAE to extract features from the input audio and construct attention-based adapters to feed these features into the internal layers of AudioLDM2, a diffusion-based text-to-music model. With 22M trainable parameters, AP-Adapter empowers users to harness both global (e.g., genre and timbre) and local (e.g., melody) aspects of music, using the original audio and a short text as inputs. Through objective and subjective studies, we evaluate AP-Adapter on three tasks: timbre transfer, genre transfer, and accompaniment generation. Additionally, we demonstrate its effectiveness on out-of-domain audio containing instruments unseen during training.
Submitted 24 July, 2024; v1 submitted 23 July, 2024; originally announced July 2024.
Comments: Accepted by the 25th International Society for Music Information Retrieval (ISMIR)
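The attention-based adapter described above can be sketched as a small cross-attention block with a zero-initialized gate, so the frozen backbone starts unchanged. The dimensions and the gated residual are assumptions; the real AP-Adapter wires AudioMAE features into AudioLDM2's internal layers.

```python
# Sketch of an attention-based audio-prompt adapter (assumed dimensions).
import torch
import torch.nn as nn

class AudioPromptAdapter(nn.Module):
    def __init__(self, d_model=512, d_audio=768, n_heads=8):
        super().__init__()
        self.proj = nn.Linear(d_audio, d_model)   # map audio feats to model dim
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(1))  # zero-init: starts as identity

    def forward(self, hidden, audio_feats):
        ctx = self.proj(audio_feats)
        out, _ = self.attn(query=hidden, key=ctx, value=ctx)
        return hidden + self.gate.tanh() * out    # gated residual injection

x = torch.randn(2, 77, 512)    # backbone hidden states
a = torch.randn(2, 32, 768)    # stand-in AudioMAE patch features
print(AudioPromptAdapter()(x, a).shape)
```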
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15060v1-abstract-full').style.display = 'none'; document.getElementById('2407.15060v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 25th International Society for Music Information Retrieval (ISMIR)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14920">arXiv:2407.14920</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14920">pdf</a>, <a href="https://arxiv.org/format/2407.14920">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RoIPoly: Vectorized Building Outline Extraction Using Vertex and Logit Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+W">Weiqin Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Persello%2C+C">Claudio Persello</a>, <a href="/search/cs?searchtype=author&amp;query=Vosselman%2C+G">George Vosselman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14920v1-abstract-short" style="display: inline;"> Polygonal building outlines are crucial for geographic and cartographic applications. The existing approaches for outline extraction from aerial or satellite imagery are typically decomposed into subtasks, e.g., building masking and vectorization, or treat this task as a sequence-to-sequence prediction of ordered vertices. The former lacks efficiency, and the latter often generates redundant verti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14920v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14920v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14920v1-abstract-full" style="display: none;"> Polygonal building outlines are crucial for geographic and cartographic applications. The existing approaches for outline extraction from aerial or satellite imagery are typically decomposed into subtasks, e.g., building masking and vectorization, or treat this task as a sequence-to-sequence prediction of ordered vertices. The former lacks efficiency, and the latter often generates redundant vertices, both resulting in suboptimal performance. To handle these issues, we propose a novel Region-of-Interest (RoI) query-based approach called RoIPoly. Specifically, we formulate each vertex as a query and constrain the query attention on the most relevant regions of a potential building, yielding reduced computational overhead and more efficient vertex level interaction. 

arXiv:2407.14920 [pdf, other] (cs.CV)
RoIPoly: Vectorized Building Outline Extraction Using Vertex and Logit Embeddings
Authors: Weiqin Jiao, Hao Cheng, Claudio Persello, George Vosselman
Abstract: Polygonal building outlines are crucial for geographic and cartographic applications. Existing approaches for outline extraction from aerial or satellite imagery are typically decomposed into subtasks, e.g., building masking and vectorization, or treat the task as sequence-to-sequence prediction of ordered vertices. The former lacks efficiency, and the latter often generates redundant vertices; both result in suboptimal performance. To handle these issues, we propose a novel Region-of-Interest (RoI) query-based approach called RoIPoly. Specifically, we formulate each vertex as a query and constrain the query attention to the most relevant regions of a potential building, yielding reduced computational overhead and more efficient vertex-level interaction. Moreover, we introduce a novel learnable logit embedding to facilitate vertex classification on the attention map, so no post-processing is needed for redundant vertex removal. We evaluated our method on the vectorized building outline extraction dataset CrowdAI and the 2D floorplan reconstruction dataset Structured3D. On the CrowdAI dataset, RoIPoly with a ResNet50 backbone outperforms existing methods with the same or better backbones on most MS-COCO metrics, especially on small buildings, and achieves competitive results in polygon quality and vertex redundancy without any post-processing. On the Structured3D dataset, our method achieves the second-best performance on most metrics among existing methods dedicated to 2D floorplan reconstruction, demonstrating cross-domain generalization capability. The code will be released upon acceptance of this paper.
Submitted 20 July, 2024; originally announced July 2024.
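Constraining each vertex query's attention to a region of interest amounts to masking attention scores outside that region. A minimal sketch, with the mask construction assumed for illustration; RoIPoly's actual mechanism is more involved.

```python
# Sketch of RoI-constrained attention for vertex queries.
import torch

def roi_attention(q, k, v, roi_mask):
    # q: (Q, d) vertex queries; k, v: (N, d) image tokens;
    # roi_mask: (Q, N) True where a token lies inside the query's RoI.
    scores = q @ k.T / q.size(-1) ** 0.5
    scores = scores.masked_fill(~roi_mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

Q, N, d = 16, 100, 32
q, k, v = torch.randn(Q, d), torch.randn(N, d), torch.randn(N, d)
roi_mask = torch.rand(Q, N) < 0.3   # toy RoIs
roi_mask[:, 0] = True               # ensure every query attends to something
print(roi_attention(q, k, v, roi_mask).shape)
```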

arXiv:2407.13133 [pdf, other] (cs.CV)
FocusDiffuser: Perceiving Local Disparities for Camouflaged Object Detection
Authors: Jianwei Zhao, Xin Li, Fan Yang, Qiang Zhai, Ao Luo, Zicheng Jiao, Hong Cheng
Abstract: Detecting objects seamlessly blended into their surroundings is a complex task for both human cognition and advanced artificial intelligence algorithms. Currently, the majority of methodologies for detecting camouflaged objects focus on discriminative models with various specialized designs. However, generative models such as Stable Diffusion possess stronger capabilities for understanding diverse objects in complex environments, yet their potential for the cognition and detection of camouflaged objects has not been extensively explored. In this study, we present FocusDiffuser, a novel denoising diffusion model, to investigate how generative models can enhance the detection and interpretation of camouflaged objects. We believe the secret to spotting camouflaged objects lies in catching subtle nuances of detail. Consequently, FocusDiffuser integrates specialized enhancements, notably the Boundary-Driven LookUp (BDLU) module and the Cyclic Positioning (CP) module, into standard diffusion models, significantly boosting their detail-oriented analytical capabilities. Our experiments demonstrate that FocusDiffuser, from a generative perspective, effectively addresses the challenge of camouflaged object detection, surpassing leading models on benchmarks such as CAMO, COD10K and NC4K.
Submitted 17 July, 2024; originally announced July 2024.
Comments: 18 pages, 7 figures
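The BDLU and CP modules are not described in enough detail above to reproduce. The sketch below only shows the generic conditional denoising loop that a diffusion-based detector of this kind builds on, with a stand-in denoiser and a deliberately crude update rule; none of it should be read as FocusDiffuser's actual method.

```python
# Generic sketch: iteratively denoise a mask conditioned on image features.
import torch

def denoise_mask(model, image_feats, steps=10):
    mask = torch.randn_like(image_feats[:, :1])   # start from pure noise
    for t in reversed(range(steps)):
        t_emb = torch.full((mask.size(0),), t, dtype=torch.float32)
        eps = model(mask, image_feats, t_emb)     # predict the noise
        mask = mask - eps / steps                 # crude illustrative update
    return mask.sigmoid()                         # near-binary object mask

model = lambda m, f, t: m * 0.1                   # stand-in denoiser
feats = torch.randn(2, 3, 32, 32)
print(denoise_mask(model, feats).shape)
```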

arXiv:2407.11717 [pdf, other] (cs.CV)
Turbo: Informativity-Driven Acceleration Plug-In for Vision-Language Large Models
Authors: Chen Ju, Haicheng Wang, Haozhe Cheng, Xu Chen, Zhonghua Zhai, Weilin Huang, Jinsong Lan, Shuai Xiao, Bo Zheng
Abstract: Vision-Language Large Models (VLMs) have recently become a primary backbone of AI due to their impressive performance. However, their expensive computation costs, i.e., throughput and delay, impede their potential in real-world scenarios. To accelerate VLMs, most existing methods focus on the model perspective (pruning, distillation, quantization) but completely overlook redundancy from the data perspective. To fill this gap, this paper examines the severity of data redundancy and designs a plug-and-play Turbo module, guided by an information degree, to prune inefficient tokens from visual or textual data. In pursuit of efficiency-performance trade-offs, the information degree takes two crucial factors into account: mutual redundancy and semantic value. The former evaluates data duplication between sequential tokens, while the latter evaluates each token by its contribution to the overall semantics. As a result, tokens with a high information degree carry less redundancy and stronger semantics. During VLM computation, Turbo works as a user-friendly plug-in that sorts tokens by information degree and keeps only the top-ranked ones to save costs. Its advantages are multifaceted: it is generally compatible with various VLMs across understanding and generation, and simple to use without re-training or nontrivial engineering effort. Experiments on multiple VLM benchmarks demonstrate Turbo's strong acceleration with a negligible performance drop.
Submitted 16 July, 2024; originally announced July 2024.
Comments: ECCV 2024. The first two authors contributed equally. arXiv admin note: substantial text overlap with arXiv:2312.07408
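A sketch of information-degree-based token pruning: low mutual redundancy plus high semantic value keeps a token. The exact formula, the use of a CLS token as the semantic anchor, and the mixing weight alpha are assumptions based only on the abstract.

```python
# Sketch of Turbo-style token pruning by an assumed information degree.
import torch

def information_degree(tokens, cls_token, alpha=0.5):
    # tokens: (N, d); redundancy = max cosine similarity to any other token
    t = torch.nn.functional.normalize(tokens, dim=-1)
    sim = t @ t.T
    sim.fill_diagonal_(-1.0)
    redundancy = sim.max(dim=-1).values               # high = duplicated
    c = torch.nn.functional.normalize(cls_token, dim=-1)
    semantic = t @ c                                  # contribution to semantics
    return alpha * (1 - redundancy) + (1 - alpha) * semantic

def turbo_prune(tokens, cls_token, keep_ratio=0.5):
    scores = information_degree(tokens, cls_token)
    k = max(1, int(keep_ratio * tokens.size(0)))
    idx = scores.topk(k).indices.sort().values        # keep original order
    return tokens[idx]

tokens = torch.randn(196, 64)
print(turbo_prune(tokens, torch.randn(64)).shape)
```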

arXiv:2407.11681 [pdf, other] (cs.CL)
MINI-LLM: Memory-Efficient Structured Pruning for Large Language Models
Authors: Hongrong Cheng, Miao Zhang, Javen Qinfeng Shi
Abstract: As Large Language Models (LLMs) grow dramatically in size, there is an increasing trend toward compressing and speeding up these models. Previous studies have highlighted the usefulness of gradients for importance scoring in neural network compression, especially for pruning medium-size networks. However, the substantial memory required to compute gradients with backpropagation impedes their use in guiding LLM pruning. As a result, most pruning strategies for LLMs rely on gradient-free criteria, such as weight magnitudes or a mix of magnitudes and activations. In this paper, we devise a hybrid pruning criterion that appropriately integrates magnitude, activation, and gradient to capitalize on feature-map sensitivity for pruning LLMs. To overcome the memory barrier, we estimate gradients using only forward passes. Based on this, we propose a Memory-effIcieNt structured prunIng procedure for LLMs (MINI-LLM) that removes non-critical channels and attention heads. Experimental results demonstrate the superior performance of MINI-LLM over existing gradient-free methods on three LLMs (LLaMA, BLOOM, and OPT) across various downstream tasks (classification, multiple-choice, and generation), while MINI-LLM maintains a GPU memory footprint akin to that of gradient-free methods.
Submitted 16 July, 2024; originally announced July 2024.
Comments: 13 pages
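Gradients estimated "using only forward passes" suggests a perturbation-based estimator. Below is one such estimator (SPSA-style central differences) feeding a magnitude x activation x gradient score; treat both the estimator and the score's exact form as assumptions, since the abstract only outlines the criterion.

```python
# Sketch: forward-only gradient estimate + hybrid importance score.
import torch

def forward_only_grad(loss_fn, w, eps=1e-3, n_samples=4):
    # Estimate dL/dw from random sign perturbations; no backprop needed,
    # so no activation memory is kept.
    g = torch.zeros_like(w)
    for _ in range(n_samples):
        u = torch.randn_like(w).sign()
        g += (loss_fn(w + eps * u) - loss_fn(w - eps * u)) / (2 * eps) * u
    return g / n_samples

def hybrid_importance(w, act_norm, grad):
    # |weight| * activation magnitude * |estimated gradient|, per channel
    return w.abs() * act_norm * grad.abs()

w = torch.randn(8)
loss_fn = lambda v: (v ** 2).sum()   # toy loss whose true gradient is 2w
grad = forward_only_grad(loss_fn, w)
print(hybrid_importance(w, torch.rand(8), grad))
```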

arXiv:2407.11306 [pdf, other] (cs.CV)
PADRe: A Unifying Polynomial Attention Drop-in Replacement for Efficient Vision Transformer
Authors: Pierre-David Letourneau, Manish Kumar Singh, Hsin-Pai Cheng, Shizhong Han, Yunxiao Shi, Dalton Jones, Matthew Harper Langston, Hong Cai, Fatih Porikli
Abstract: We present Polynomial Attention Drop-in Replacement (PADRe), a novel and unifying framework designed to replace the conventional self-attention mechanism in transformer models. Notably, several recent alternative attention mechanisms, including Hyena, Mamba, SimA, Conv2Former, and Castling-ViT, can be viewed as specific instances of the PADRe framework. PADRe leverages polynomial functions and draws upon established results from approximation theory, enhancing computational efficiency without compromising accuracy. PADRe's key components include multiplicative nonlinearities, which we implement using straightforward, hardware-friendly operations such as Hadamard products, incurring only linear computational and memory costs. PADRe also avoids the need for complex functions such as softmax, yet maintains comparable or superior accuracy relative to traditional self-attention. We assess the effectiveness of PADRe as a drop-in replacement for self-attention across diverse computer vision tasks, including image classification, image-based 2D object detection, and 3D point cloud object detection. Empirical results demonstrate that PADRe runs significantly faster than conventional self-attention (11x to 43x faster on server GPU and mobile NPU) while maintaining similar accuracy when substituting self-attention in transformer models.
Submitted 15 July, 2024; originally announced July 2024.
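A toy token mixer in the spirit described above: a multiplicative (Hadamard-product) nonlinearity over linear projections, with no softmax and linear cost in sequence length. The degree-2 form and the mean-pooled context are illustrative assumptions, not the paper's exact construction.

```python
# Toy polynomial token mixer with a Hadamard-product nonlinearity.
import torch
import torch.nn as nn

class PolyMixer(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.p1 = nn.Linear(d, d)
        self.p2 = nn.Linear(d, d)
        self.out = nn.Linear(d, d)

    def forward(self, x):                     # x: (B, N, d)
        mixed = x.mean(dim=1, keepdim=True)   # cheap global context, O(N d)
        # degree-2 multiplicative term via an elementwise (Hadamard) product
        return self.out(self.p1(x) * self.p2(mixed))

x = torch.randn(2, 196, 64)
print(PolyMixer(64)(x).shape)
```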

arXiv:2407.09590 [pdf, other] (cs.CL, cs.LG)
Diversifying the Expert Knowledge for Task-Agnostic Pruning in Sparse Mixture-of-Experts
Authors: Zeliang Zhang, Xiaodong Liu, Hao Cheng, Chenliang Xu, Jianfeng Gao
Abstract: By increasing model parameters but activating them sparsely when performing a task, the Mixture-of-Experts (MoE) architecture significantly improves the performance of Large Language Models (LLMs) without increasing the inference cost. However, the memory consumption of the growing number of experts challenges the deployment of these models in many real-world settings. Our empirical study reveals that some experts encode redundant knowledge during pre-training. We thus propose a method of grouping and pruning similar experts to improve the model's parameter efficiency. We validate the effectiveness of our method by pruning three state-of-the-art MoE architectures, including Mixtral, Deepseek-MoE, and Qwen. The evaluation shows that our method outperforms other model pruning methods on a range of natural language tasks. We will release our code to facilitate future research.
Submitted 19 October, 2024; v1 submitted 12 July, 2024; originally announced July 2024.
Comments: 13 pages, 6 figures
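"Grouping and pruning similar experts" can be sketched as clustering experts by weight similarity and collapsing each group into one expert. The greedy thresholded grouping and mean-merging below are assumptions; the paper's actual grouping criterion is not spelled out in the abstract.

```python
# Sketch: group near-duplicate experts by cosine similarity, merge each group.
import torch

def group_and_merge(expert_weights, threshold=0.9):
    # expert_weights: list of equally shaped weight tensors, one per expert
    flats = torch.stack([w.flatten() for w in expert_weights])
    flats = torch.nn.functional.normalize(flats, dim=-1)
    sim = flats @ flats.T
    merged, used = [], set()
    for i in range(len(expert_weights)):
        if i in used:
            continue
        group = [j for j in range(len(expert_weights))
                 if j not in used and sim[i, j] >= threshold]
        used.update(group)
        merged.append(torch.stack([expert_weights[j] for j in group]).mean(0))
    return merged   # fewer experts; redundant knowledge collapsed

experts = [torch.randn(16, 16) for _ in range(8)]
experts[1] = experts[0] + 0.01 * torch.randn(16, 16)   # near-duplicate pair
print(len(group_and_merge(experts)))                   # prints 7
```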
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09590v3-abstract-full').style.display = 'none'; document.getElementById('2407.09590v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04969">arXiv:2407.04969</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04969">pdf</a>, <a href="https://arxiv.org/format/2407.04969">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EVA-Score: Evaluating Abstractive Long-form Summarization on Informativeness through Extraction and Validation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yuchen Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+X">Xin Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Y">Yazhe Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chengsi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Haonan Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Gaoche Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04969v3-abstract-short" style="display: inline;"> Since LLMs emerged, more attention has been paid to abstractive long-form summarization, where longer input sequences indicate more information contained. Nevertheless, the automatic evaluation of such summaries remains underexplored. The current evaluation metrics for long-form summarization either use similarity-based metrics like ROUGE and BERTScore or LLM-based metrics using appropriate prompt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04969v3-abstract-full').style.display = 'inline'; document.getElementById('2407.04969v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04969v3-abstract-full" style="display: none;"> Since LLMs emerged, more attention has been paid to abstractive long-form summarization, where longer input sequences indicate more information contained. Nevertheless, the automatic evaluation of such summaries remains underexplored. The current evaluation metrics for long-form summarization either use similarity-based metrics like ROUGE and BERTScore or LLM-based metrics using appropriate prompts or pre-defined schema. 
We argue that the former only relies on similarity and fails to consider informativeness while the latter lacks quantitative analysis of informative richness, and is rather subjective and hard to explain. Current evaluation metrics either use traditional metrics like ROUGE and BERTScore, which rely on surface-level similarity and fail to consider informativeness, or simple LLM-based metrics, which are not robust and easily overwhelmed by the long contexts. In this paper, we propose a new evaluation metric called EVA-Score to extract all information from the given summaries, identify overlapped information based on reference, and calculate the information score. We test EVA-Score on several datasets and the experimental results reveal that EVA-Score shows the highest correlation with humans. We also re-evaluate the performance of LLMs on long-form summarization from the information perspective. The results indicate that responses of LLMs still have a gap with the human-written answers. Moreover, we provide a detailed analysis of the effectiveness of EVA-Score, forecasting future ways to automatically evaluate abstractive long-form summarization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04969v3-abstract-full').style.display = 'none'; document.getElementById('2407.04969v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Cheng%2C+H&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> 
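The extract-then-validate idea can be illustrated end to end: split summaries into atomic statements, check each against the reference, and report an F1-style information score. The sentence splitting and word-overlap validator below are crude stand-ins for the paper's LLM-based extraction and validation.

```python
# Sketch of an EVA-Score style information-overlap metric.
def atoms(text):
    # stand-in extraction: one "atomic statement" per sentence
    return [s.strip().lower() for s in text.split(".") if s.strip()]

def matched(a, ref_atoms):
    # stand-in validator: word-overlap Jaccard above a threshold
    aw = set(a.split())
    return any(len(aw & set(r.split())) / len(aw | set(r.split())) > 0.5
               for r in ref_atoms)

def eva_score(candidate, reference):
    cand, ref = atoms(candidate), atoms(reference)
    precision = sum(matched(a, ref) for a in cand) / len(cand)
    recall = sum(matched(r, cand) for r in ref) / len(ref)
    return 2 * precision * recall / (precision + recall + 1e-9)

print(eva_score("The cat sat on the mat. Dogs bark loudly.",
                "The cat sat on the mat."))   # ~0.67: one extra claim
```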
<li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 
47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10