Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,317 results for author: <span class="mathjax">He, X</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=He%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="He, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=He%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="He, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09181">arXiv:2411.09181</a> <span> [<a href="https://arxiv.org/pdf/2411.09181">pdf</a>, <a href="https://arxiv.org/ps/2411.09181">ps</a>, <a href="https://arxiv.org/format/2411.09181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeBaTeR: Denoising Bipartite Temporal Graph for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xinyu He</a>, <a href="/search/cs?searchtype=author&query=Sepulveda%2C+J">Jose Sepulveda</a>, <a href="/search/cs?searchtype=author&query=Rahmani%2C+M">Mostafa Rahmani</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+A">Alyssa Woo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09181v1-abstract-short" style="display: inline;"> Due to the difficulty of acquiring large-scale explicit user feedback, implicit feedback (e.g., clicks or other interactions) is widely applied as an alternative source of data, where user-item interactions can be modeled as a bipartite graph. Due to the noisy and biased nature of implicit real-world user-item interactions, identifying and rectifying noisy interactions are vital to enhance model p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09181v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09181v1-abstract-full" style="display: none;"> Due to the difficulty of acquiring large-scale explicit user feedback, implicit feedback (e.g., clicks or other interactions) is widely applied as an alternative source of data, where user-item interactions can be modeled as a bipartite graph. Due to the noisy and biased nature of implicit real-world user-item interactions, identifying and rectifying noisy interactions are vital to enhance model performance and robustness. Previous works on purifying user-item interactions in collaborative filtering mainly focus on mining the correlation between user/item embeddings and noisy interactions, neglecting the benefit of temporal patterns in determining noisy interactions. Time information, while enhancing the model utility, also bears its natural advantage in helping to determine noisy edges, e.g., if someone usually watches horror movies at night and talk shows in the morning, a record of watching a horror movie in the morning is more likely to be noisy interaction. Armed with this observation, we introduce a simple yet effective mechanism for generating time-aware user/item embeddings and propose two strategies for denoising bipartite temporal graph in recommender systems (DeBaTeR): the first is through reweighting the adjacency matrix (DeBaTeR-A), where a reliability score is defined to reweight the edges through both soft assignment and hard assignment; the second is through reweighting the loss function (DeBaTeR-L), where weights are generated to reweight user-item samples in the losses. Extensive experiments have been conducted to demonstrate the efficacy of our methods and illustrate how time information indeed helps identifying noisy edges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09181v1-abstract-full').style.display = 'none'; document.getElementById('2411.09181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06714">arXiv:2411.06714</a> <span> [<a href="https://arxiv.org/pdf/2411.06714">pdf</a>, <a href="https://arxiv.org/format/2411.06714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiffSR: Learning Radar Reflectivity Synthesis via Diffusion Model from Satellite Observations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xuming He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhiwang Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenlong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06714v1-abstract-short" style="display: inline;"> Weather radar data synthesis can fill in data for areas where ground observations are missing. Existing methods often employ reconstruction-based approaches with MSE loss to reconstruct radar data from satellite observation. However, such methods lead to over-smoothing, which hinders the generation of high-frequency details or high-value observation areas associated with convective weather. To add… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06714v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06714v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06714v1-abstract-full" style="display: none;"> Weather radar data synthesis can fill in data for areas where ground observations are missing. Existing methods often employ reconstruction-based approaches with MSE loss to reconstruct radar data from satellite observation. However, such methods lead to over-smoothing, which hinders the generation of high-frequency details or high-value observation areas associated with convective weather. To address this issue, we propose a two-stage diffusion-based method called DiffSR. We first pre-train a reconstruction model on global-scale data to obtain radar estimation and then synthesize radar reflectivity by combining radar estimation results with satellite data as conditions for the diffusion model. Extensive experiments show that our method achieves state-of-the-art (SOTA) results, demonstrating the ability to generate high-frequency details and high-value areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06714v1-abstract-full').style.display = 'none'; document.getElementById('2411.06714v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03829">arXiv:2411.03829</a> <span> [<a href="https://arxiv.org/pdf/2411.03829">pdf</a>, <a href="https://arxiv.org/format/2411.03829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalize or Detect? Towards Robust Semantic Segmentation Under Multiple Distribution Shifts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhitong Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bingnan Li</a>, <a href="/search/cs?searchtype=author&query=Salzmann%2C+M">Mathieu Salzmann</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuming He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03829v1-abstract-short" style="display: inline;"> In open-world scenarios, where both novel classes and domains may exist, an ideal segmentation model should detect anomaly classes for safety and generalize to new domains. However, existing methods often struggle to distinguish between domain-level and semantic-level distribution shifts, leading to poor out-of-distribution (OOD) detection or domain generalization performance. In this work, we aim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03829v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03829v1-abstract-full" style="display: none;"> In open-world scenarios, where both novel classes and domains may exist, an ideal segmentation model should detect anomaly classes for safety and generalize to new domains. However, existing methods often struggle to distinguish between domain-level and semantic-level distribution shifts, leading to poor out-of-distribution (OOD) detection or domain generalization performance. In this work, we aim to equip the model to generalize effectively to covariate-shift regions while precisely identifying semantic-shift regions. To achieve this, we design a novel generative augmentation method to produce coherent images that incorporate both anomaly (or novel) objects and various covariate shifts at both image and object levels. Furthermore, we introduce a training strategy that recalibrates uncertainty specifically for semantic shifts and enhances the feature extractor to align features associated with domain shifts. We validate the effectiveness of our method across benchmarks featuring both semantic and domain shifts. Our method achieves state-of-the-art performance across all benchmarks for both OOD detection and domain generalization. Code is available at https://github.com/gaozhitong/MultiShiftSeg. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03829v1-abstract-full').style.display = 'none'; document.getElementById('2411.03829v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02461">arXiv:2411.02461</a> <span> [<a href="https://arxiv.org/pdf/2411.02461">pdf</a>, <a href="https://arxiv.org/format/2411.02461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Multiple Dimensions of Trustworthiness in LLMs via Sparse Activation Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yuxin Xiao</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Chaoqun Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yonggang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xu Shen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02461v1-abstract-short" style="display: inline;"> As the development and application of Large Language Models (LLMs) continue to advance rapidly, enhancing their trustworthiness and aligning them with human preferences has become a critical area of research. Traditional methods rely heavily on extensive data for Reinforcement Learning from Human Feedback (RLHF), but representation engineering offers a new, training-free approach. This technique l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02461v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02461v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02461v1-abstract-full" style="display: none;"> As the development and application of Large Language Models (LLMs) continue to advance rapidly, enhancing their trustworthiness and aligning them with human preferences has become a critical area of research. Traditional methods rely heavily on extensive data for Reinforcement Learning from Human Feedback (RLHF), but representation engineering offers a new, training-free approach. This technique leverages semantic features to control the representation of LLM's intermediate hidden states, enabling the model to meet specific requirements such as increased honesty or heightened safety awareness. However, a significant challenge arises when attempting to fulfill multiple requirements simultaneously. It proves difficult to encode various semantic contents, like honesty and safety, into a singular semantic feature, restricting its practicality. In this work, we address this issue through ``Sparse Activation Control''. By delving into the intrinsic mechanisms of LLMs, we manage to identify and pinpoint components that are closely related to specific tasks within the model, i.e., attention heads. These heads display sparse characteristics that allow for near-independent control over different tasks. Our experiments, conducted on the open-source Llama series models, have yielded encouraging results. The models were able to align with human preferences on issues of safety, factuality, and bias concurrently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02461v1-abstract-full').style.display = 'none'; document.getElementById('2411.02461v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01822">arXiv:2411.01822</a> <span> [<a href="https://arxiv.org/pdf/2411.01822">pdf</a>, <a href="https://arxiv.org/format/2411.01822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Distribution alignment based transfer fusion frameworks on quantum devices for seeking quantum advantages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xi He</a>, <a href="/search/cs?searchtype=author&query=Du%2C+F">Feiyu Du</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaohan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+T">Tao Lei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01822v1-abstract-short" style="display: inline;"> The scarcity of labelled data is specifically an urgent challenge in the field of quantum machine learning (QML). Two transfer fusion frameworks are proposed in this paper to predict the labels of a target domain data by aligning its distribution to a different but related labelled source domain on quantum devices. The frameworks fuses the quantum data from two different, but related domains throu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01822v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01822v1-abstract-full" style="display: none;"> The scarcity of labelled data is specifically an urgent challenge in the field of quantum machine learning (QML). Two transfer fusion frameworks are proposed in this paper to predict the labels of a target domain data by aligning its distribution to a different but related labelled source domain on quantum devices. The frameworks fuses the quantum data from two different, but related domains through a quantum information infusion channel. The predicting tasks in the target domain can be achieved with quantum advantages by post-processing quantum measurement results. One framework, the quantum basic linear algebra subroutines (QBLAS) based implementation, can theoretically achieve the procedure of transfer fusion with quadratic speedup on a universal quantum computer. In addition, the other framework, a hardware-scalable architecture, is implemented on the noisy intermediate-scale quantum (NISQ) devices through a variational hybrid quantum-classical procedure. Numerical experiments on the synthetic and handwritten digits datasets demonstrate that the variatioinal transfer fusion (TF) framework can reach state-of-the-art (SOTA) quantum DA method performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01822v1-abstract-full').style.display = 'none'; document.getElementById('2411.01822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01690">arXiv:2411.01690</a> <span> [<a href="https://arxiv.org/pdf/2411.01690">pdf</a>, <a href="https://arxiv.org/format/2411.01690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3589334.3645626">10.1145/3589334.3645626 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Co-clustering for Federated Recommender System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xinrui He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuo Liu</a>, <a href="/search/cs?searchtype=author&query=Keung%2C+J">Jackey Keung</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingrui He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01690v1-abstract-short" style="display: inline;"> As data privacy and security attract increasing attention, Federated Recommender System (FRS) offers a solution that strikes a balance between providing high-quality recommendations and preserving user privacy. However, the presence of statistical heterogeneity in FRS, commonly observed due to personalized decision-making patterns, can pose challenges. To address this issue and maximize the benefi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01690v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01690v1-abstract-full" style="display: none;"> As data privacy and security attract increasing attention, Federated Recommender System (FRS) offers a solution that strikes a balance between providing high-quality recommendations and preserving user privacy. However, the presence of statistical heterogeneity in FRS, commonly observed due to personalized decision-making patterns, can pose challenges. To address this issue and maximize the benefit of collaborative filtering (CF) in FRS, it is intuitive to consider clustering clients (users) as well as items into different groups and learning group-specific models. Existing methods either resort to client clustering via user representations-risking privacy leakage, or employ classical clustering strategies on item embeddings or gradients, which we found are plagued by the curse of dimensionality. In this paper, we delve into the inefficiencies of the K-Means method in client grouping, attributing failures due to the high dimensionality as well as data sparsity occurring in FRS, and propose CoFedRec, a novel Co-clustering Federated Recommendation mechanism, to address clients heterogeneity and enhance the collaborative filtering within the federated framework. Specifically, the server initially formulates an item membership from the client-provided item networks. Subsequently, clients are grouped regarding a specific item category picked from the item membership during each communication round, resulting in an intelligently aggregated group model. Meanwhile, to comprehensively capture the global inter-relationships among items, we incorporate an additional supervised contrastive learning term based on the server-side generated item membership into the local training phase for each client. Extensive experiments on four datasets are provided, which verify the effectiveness of the proposed CoFedRec. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01690v1-abstract-full').style.display = 'none'; document.getElementById('2411.01690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WWW '24: Proceedings of the ACM Web Conference 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01656">arXiv:2411.01656</a> <span> [<a href="https://arxiv.org/pdf/2411.01656">pdf</a>, <a href="https://arxiv.org/format/2411.01656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Degradation-Aware Residual-Conditioned Optimal Transport for Unified Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiaole Tang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xiang Gu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoyi He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xin Hu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jian Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01656v1-abstract-short" style="display: inline;"> All-in-one image restoration has emerged as a practical and promising low-level vision task for real-world applications. In this context, the key issue lies in how to deal with different types of degraded images simultaneously. In this work, we present a Degradation-Aware Residual-Conditioned Optimal Transport (DA-RCOT) approach that models (all-in-one) image restoration as an optimal transport (O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01656v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01656v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01656v1-abstract-full" style="display: none;"> All-in-one image restoration has emerged as a practical and promising low-level vision task for real-world applications. In this context, the key issue lies in how to deal with different types of degraded images simultaneously. In this work, we present a Degradation-Aware Residual-Conditioned Optimal Transport (DA-RCOT) approach that models (all-in-one) image restoration as an optimal transport (OT) problem for unpaired and paired settings, introducing the transport residual as a degradation-specific cue for both the transport cost and the transport map. Specifically, we formalize image restoration with a residual-guided OT objective by exploiting the degradation-specific patterns of the Fourier residual in the transport cost. More crucially, we design the transport map for restoration as a two-pass DA-RCOT map, in which the transport residual is computed in the first pass and then encoded as multi-scale residual embeddings to condition the second-pass restoration. This conditioning process injects intrinsic degradation knowledge (e.g., degradation type and level) and structural information from the multi-scale residual embeddings into the OT map, which thereby can dynamically adjust its behaviors for all-in-one restoration. Extensive experiments across five degradations demonstrate the favorable performance of DA-RCOT as compared to state-of-the-art methods, in terms of distortion measures, perceptual quality, and image structure preservation. Notably, DA-RCOT delivers superior adaptability to real-world scenarios even with multiple degradations and shows distinctive robustness to both degradation levels and the number of degradations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01656v1-abstract-full').style.display = 'none'; document.getElementById('2411.01656v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00769">arXiv:2411.00769</a> <span> [<a href="https://arxiv.org/pdf/2411.00769">pdf</a>, <a href="https://arxiv.org/format/2411.00769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GameGen-X: Interactive Open-world Game Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Che%2C+H">Haoxuan Che</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuanhua He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Quande Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00769v1-abstract-short" style="display: inline;"> We introduce GameGen-X, the first diffusion transformer model specifically designed for both generating and interactively controlling open-world game videos. This model facilitates high-quality, open-domain generation by simulating an extensive array of game engine features, such as innovative characters, dynamic environments, complex actions, and diverse events. Additionally, it provides interact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00769v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00769v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00769v1-abstract-full" style="display: none;"> We introduce GameGen-X, the first diffusion transformer model specifically designed for both generating and interactively controlling open-world game videos. This model facilitates high-quality, open-domain generation by simulating an extensive array of game engine features, such as innovative characters, dynamic environments, complex actions, and diverse events. Additionally, it provides interactive controllability, predicting and altering future content based on the current clip, thus allowing for gameplay simulation. To realize this vision, we first collected and built an Open-World Video Game Dataset from scratch. It is the first and largest dataset for open-world game video generation and control, which comprises over a million diverse gameplay video clips sampling from over 150 games with informative captions from GPT-4o. GameGen-X undergoes a two-stage training process, consisting of foundation model pre-training and instruction tuning. Firstly, the model was pre-trained via text-to-video generation and video continuation, endowing it with the capability for long-sequence, high-quality open-domain game video generation. Further, to achieve interactive controllability, we designed InstructNet to incorporate game-related multi-modal control signal experts. This allows the model to adjust latent representations based on user inputs, unifying character interaction and scene content control for the first time in video generation. During instruction tuning, only the InstructNet is updated while the pre-trained foundation model is frozen, enabling the integration of interactive controllability without loss of diversity and quality of generated video content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00769v1-abstract-full').style.display = 'none'; document.getElementById('2411.00769v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://github.com/GameGen-X/GameGen-X</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00373">arXiv:2411.00373</a> <span> [<a href="https://arxiv.org/pdf/2411.00373">pdf</a>, <a href="https://arxiv.org/format/2411.00373">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Discrete RIS Enhanced Space Shift Keying MIMO System via Reflecting Beamforming Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xusheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinyuan He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lexi Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yaxin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00373v1-abstract-short" style="display: inline;"> In this paper, a discrete reconfigurable intelligent surface (RIS)-assisted spatial shift keying (SSK) multiple-input multiple-output (MIMO) scheme is investigated, in which a direct link between the transmitter and the receiver is considered. To improve the reliability of the RIS-SSK-MIMO scheme, we formulate an objective function based on minimizing the average bit error probability (ABEP). Sinc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00373v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00373v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00373v1-abstract-full" style="display: none;"> In this paper, a discrete reconfigurable intelligent surface (RIS)-assisted spatial shift keying (SSK) multiple-input multiple-output (MIMO) scheme is investigated, in which a direct link between the transmitter and the receiver is considered. To improve the reliability of the RIS-SSK-MIMO scheme, we formulate an objective function based on minimizing the average bit error probability (ABEP). Since the reflecting phase shift of RIS is discrete, it is difficult to address this problem directly. To this end, we optimize the RIS phase shift to maximize the Euclidean distance between the minimum constellations by applying the successive convex approximation (SCA) and penaltyalternating optimization method. Simulation results verify the superiority of the proposed RIS-SSK-MIMO scheme and demonstrate the impact of the number of RIS elements, the number of phase quantization bits, and the number of receive and transmit antennas in terms of reliability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00373v1-abstract-full').style.display = 'none'; document.getElementById('2411.00373v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23166">arXiv:2410.23166</a> <span> [<a href="https://arxiv.org/pdf/2410.23166">pdf</a>, <a href="https://arxiv.org/format/2410.23166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SciPIP: An LLM-based Scientific Paper Idea Proposer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+L">Lihui Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liye Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yunxiang Luo</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yi Dai</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23166v1-abstract-short" style="display: inline;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23166v1-abstract-full" style="display: none;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea proposal has not been thoroughly explored. This paper proposes a scientific paper idea proposer (SciPIP). Based on a user-provided research background, SciPIP retrieves helpful papers from a literature database while leveraging the capabilities of LLMs to generate more novel and feasible ideas. To this end, 1) we construct a literature retrieval database, extracting lots of papers' multi-dimension information for fast access. Then, a literature retrieval method based on semantics, entity, and citation co-occurrences is proposed to search relevant literature from multiple aspects based on the user-provided background. 2) After literature retrieval, we introduce dual-path idea proposal strategies, where one path infers solutions from the retrieved literature and the other path generates original ideas through model brainstorming. We then combine the two to achieve a good balance between feasibility and originality. Through extensive experiments on the natural language processing (NLP) field, we demonstrate that SciPIP can retrieve citations similar to those of existing top conference papers and generate many ideas consistent with them. Additionally, we evaluate the originality of other ideas generated by SciPIP using large language models, further validating the effectiveness of our proposed method. The code and the database are released at https://github.com/cheerss/SciPIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'none'; document.getElementById('2410.23166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 5 figures, 19 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23136">arXiv:2410.23136</a> <span> [<a href="https://arxiv.org/pdf/2410.23136">pdf</a>, <a href="https://arxiv.org/format/2410.23136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Personalization for LLM-based Recommendation with Customized In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+K">Keqin Bao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jizhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23136v1-abstract-short" style="display: inline;"> Frequently updating Large Language Model (LLM)-based recommender systems to adapt to new user interests -- as done for traditional ones -- is impractical due to high training costs, even with acceleration methods. This work explores adapting to dynamic user interests without any model updates by leveraging In-Context Learning (ICL), which allows LLMs to learn new tasks from few-shot examples provi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23136v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23136v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23136v1-abstract-full" style="display: none;"> Frequently updating Large Language Model (LLM)-based recommender systems to adapt to new user interests -- as done for traditional ones -- is impractical due to high training costs, even with acceleration methods. This work explores adapting to dynamic user interests without any model updates by leveraging In-Context Learning (ICL), which allows LLMs to learn new tasks from few-shot examples provided in the input. Using new-interest examples as the ICL few-shot examples, LLMs may learn real-time interest directly, avoiding the need for model updates. However, existing LLM-based recommenders often lose the in-context learning ability during recommendation tuning, while the original LLM's in-context learning lacks recommendation-specific focus. To address this, we propose RecICL, which customizes recommendation-specific in-context learning for real-time recommendations. RecICL organizes training examples in an in-context learning format, ensuring that in-context learning ability is preserved and aligned with the recommendation task during tuning. Extensive experiments demonstrate RecICL's effectiveness in delivering real-time recommendations without requiring model updates. Our code is available at https://github.com/ym689/rec_icl. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23136v1-abstract-full').style.display = 'none'; document.getElementById('2410.23136v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21520">arXiv:2410.21520</a> <span> [<a href="https://arxiv.org/pdf/2410.21520">pdf</a>, <a href="https://arxiv.org/format/2410.21520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-Forest for Health Tabular Data Imputation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xinrui He</a>, <a href="/search/cs?searchtype=author&query=Ban%2C+Y">Yikun Ban</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+J">Jiaru Zou</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tianxin Wei</a>, <a href="/search/cs?searchtype=author&query=Cook%2C+C+B">Curtiss B. Cook</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingrui He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21520v1-abstract-short" style="display: inline;"> Missing data imputation is a critical challenge in tabular datasets, especially in healthcare, where data completeness is vital for accurate analysis. Large language models (LLMs), trained on vast corpora, have shown strong potential in data generation, making them a promising tool for tabular data imputation. However, challenges persist in designing effective prompts for a finetuning-free process… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21520v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21520v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21520v1-abstract-full" style="display: none;"> Missing data imputation is a critical challenge in tabular datasets, especially in healthcare, where data completeness is vital for accurate analysis. Large language models (LLMs), trained on vast corpora, have shown strong potential in data generation, making them a promising tool for tabular data imputation. However, challenges persist in designing effective prompts for a finetuning-free process and in mitigating the risk of LLM hallucinations. To address these issues, we propose a novel framework, LLM-Forest, which introduces a "forest" of few-shot learning LLM "trees" with confidence-based weighted voting. This framework is established on a new concept of bipartite information graphs to identify high-quality relevant neighboring entries with both feature and value granularity. Extensive experiments on four real-world healthcare datasets demonstrate the effectiveness and efficiency of LLM-Forest. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21520v1-abstract-full').style.display = 'none'; document.getElementById('2410.21520v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20533">arXiv:2410.20533</a> <span> [<a href="https://arxiv.org/pdf/2410.20533">pdf</a>, <a href="https://arxiv.org/format/2410.20533">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Guiding Through Complexity: What Makes Good Supervision for Hard Reasoning Tasks? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xuan He</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+N">Nanyun Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20533v2-abstract-short" style="display: inline;"> How can "weak teacher models" such as average human annotators or existing AI systems, effectively supervise LLMs to improve performance on hard reasoning tasks, especially those that challenge and requires expertise or daily practice from the teacher models? In this paper, we seek for empirical answers to this question by investigating various data-driven strategies that offer supervision data at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20533v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20533v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20533v2-abstract-full" style="display: none;"> How can "weak teacher models" such as average human annotators or existing AI systems, effectively supervise LLMs to improve performance on hard reasoning tasks, especially those that challenge and requires expertise or daily practice from the teacher models? In this paper, we seek for empirical answers to this question by investigating various data-driven strategies that offer supervision data at different quality levels upon tasks of varying complexity. Two intuitive strategies emerge for teacher models to provide supervision during alignment training: 1) using lower-quality supervision from complete tasks that match the difficulty of the target reasoning tasks, and 2) leveraging higher-quality supervision from easier subtasks that are less challenging. Interestingly, we find that even when the outcome error rate for hard task supervision is high (e.g., 90\%), training on such data can outperform perfectly correct supervision on easier subtasks on multiple hard math benchmarks. We further identify a more critical factor influencing training performance: step-wise error rates, which indicate the severity of errors in solutions. Specifically, training on hard task supervision with the same outcome error rates but disparate step-wise error rates can lead to a 30\% accuracy gap on MATH benchmark. Our results also reveal that supplementing hard task supervision with the corresponding subtask supervision can yield notable performance improvements than simply combining rephrased hard full task supervision, suggesting new avenues for data augmentation. Data and code are released at \url{https://github.com/hexuan21/Weak-to-Strong}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20533v2-abstract-full').style.display = 'none'; document.getElementById('2410.20533v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19406">arXiv:2410.19406</a> <span> [<a href="https://arxiv.org/pdf/2410.19406">pdf</a>, <a href="https://arxiv.org/format/2410.19406">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Auditing Test To Detect Behavioral Shift in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Richter%2C+L">Leo Richter</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&query=Kusner%2C+M+J">Matt J. Kusner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19406v1-abstract-short" style="display: inline;"> As language models (LMs) approach human-level performance, a comprehensive understanding of their behavior becomes crucial. This includes evaluating capabilities, biases, task performance, and alignment with societal values. Extensive initial evaluations, including red teaming and diverse benchmarking, can establish a model's behavioral profile. However, subsequent fine-tuning or deployment modifi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19406v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19406v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19406v1-abstract-full" style="display: none;"> As language models (LMs) approach human-level performance, a comprehensive understanding of their behavior becomes crucial. This includes evaluating capabilities, biases, task performance, and alignment with societal values. Extensive initial evaluations, including red teaming and diverse benchmarking, can establish a model's behavioral profile. However, subsequent fine-tuning or deployment modifications may alter these behaviors in unintended ways. We present a method for continual Behavioral Shift Auditing (BSA) in LMs. Building on recent work in hypothesis testing, our auditing test detects behavioral shifts solely through model generations. Our test compares model generations from a baseline model to those of the model under scrutiny and provides theoretical guarantees for change detection while controlling false positives. The test features a configurable tolerance parameter that adjusts sensitivity to behavioral changes for different use cases. We evaluate our approach using two case studies: monitoring changes in (a) toxicity and (b) translation performance. We find that the test is able to detect meaningful changes in behavior distributions using just hundreds of examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19406v1-abstract-full').style.display = 'none'; document.getElementById('2410.19406v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17954">arXiv:2410.17954</a> <span> [<a href="https://arxiv.org/pdf/2410.17954">pdf</a>, <a href="https://arxiv.org/format/2410.17954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ExpertFlow: Optimized Expert Activation and Token Allocation for Efficient Mixture-of-Experts Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xin He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunkang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Haiyan Yin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zihao Zeng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shaohuai Shi</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a>, <a href="/search/cs?searchtype=author&query=Tsang%2C+I">Ivor Tsang</a>, <a href="/search/cs?searchtype=author&query=Soon%2C+O+Y">Ong Yew Soon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17954v1-abstract-short" style="display: inline;"> Sparse Mixture of Experts (MoE) models, while outperforming dense Large Language Models (LLMs) in terms of performance, face significant deployment challenges during inference due to their high memory demands. Existing offloading techniques, which involve swapping activated and idle experts between the GPU and CPU, often suffer from rigid expert caching mechanisms. These mechanisms fail to adapt t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17954v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17954v1-abstract-full" style="display: none;"> Sparse Mixture of Experts (MoE) models, while outperforming dense Large Language Models (LLMs) in terms of performance, face significant deployment challenges during inference due to their high memory demands. Existing offloading techniques, which involve swapping activated and idle experts between the GPU and CPU, often suffer from rigid expert caching mechanisms. These mechanisms fail to adapt to dynamic routing, leading to inefficient cache utilization, or incur prohibitive costs for prediction training. To tackle these inference-specific challenges, we introduce ExpertFlow, a comprehensive system specifically designed to enhance inference efficiency by accommodating flexible routing and enabling efficient expert scheduling between CPU and GPU. This reduces overhead and boosts system performance. Central to our approach is a predictive routing path-based offloading mechanism that utilizes a lightweight predictor to accurately forecast routing paths before computation begins. This proactive strategy allows for real-time error correction in expert caching, significantly increasing cache hit ratios and reducing the frequency of expert transfers, thereby minimizing I/O overhead. Additionally, we implement a dynamic token scheduling strategy that optimizes MoE inference by rearranging input tokens across different batches. This method not only reduces the number of activated experts per batch but also improves computational efficiency. Our extensive experiments demonstrate that ExpertFlow achieves up to 93.72\% GPU memory savings and enhances inference speed by 2 to 10 times compared to baseline methods, highlighting its effectiveness and utility as a robust solution for resource-constrained inference scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17954v1-abstract-full').style.display = 'none'; document.getElementById('2410.17954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Mixture-of-Experts, Inference, Offloading</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16236">arXiv:2410.16236</a> <span> [<a href="https://arxiv.org/pdf/2410.16236">pdf</a>, <a href="https://arxiv.org/format/2410.16236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LLaVA-KD: A Framework of Distilling Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuxuan Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Haoyang He</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinwei He</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+A">Ao Tong</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Zhenye Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiang Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16236v2-abstract-short" style="display: inline;"> The success of Large Language Models (LLM) has led researchers to explore Multimodal Large Language Models (MLLM) for unified visual and linguistic understanding. However, the increasing model size and computational complexity of MLLM limit their use in resource-constrained environments. Small-scale MLLM (s-MLLM) aims to retain the capabilities of the large-scale model (l-MLLM) while reducing comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16236v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16236v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16236v2-abstract-full" style="display: none;"> The success of Large Language Models (LLM) has led researchers to explore Multimodal Large Language Models (MLLM) for unified visual and linguistic understanding. However, the increasing model size and computational complexity of MLLM limit their use in resource-constrained environments. Small-scale MLLM (s-MLLM) aims to retain the capabilities of the large-scale model (l-MLLM) while reducing computational demands, but resulting in a significant decline in performance. To address the aforementioned issues, we propose a novel LLaVA-KD framework to transfer knowledge from l-MLLM to s-MLLM. Specifically, we introduce Multimodal Distillation (MDist) to minimize the divergence between the visual-textual output distributions of l-MLLM and s-MLLM, and Relation Distillation (RDist) to transfer l-MLLM's ability to model correlations between visual features. Additionally, we propose a three-stage training scheme to fully exploit the potential of s-MLLM: 1) Distilled Pre-Training to align visual-textual representations, 2) Supervised Fine-Tuning to equip the model with multimodal understanding, and 3) Distilled Fine-Tuning to further transfer l-MLLM capabilities. Our approach significantly improves performance without altering the small model's architecture. Extensive experiments and ablation studies validate the effectiveness of each proposed component. Code will be available at https://github.com/Fantasyele/LLaVA-KD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16236v2-abstract-full').style.display = 'none'; document.getElementById('2410.16236v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16146">arXiv:2410.16146</a> <span> [<a href="https://arxiv.org/pdf/2410.16146">pdf</a>, <a href="https://arxiv.org/format/2410.16146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Combating Frequency Simplicity-biased Learning for Domain Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xilin He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingyu Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qinliang Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Cheng Luo</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+W">Weicheng Xie</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Siyang Song</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+M+H">Muhammad Haris Khan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Linlin Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16146v1-abstract-short" style="display: inline;"> Domain generalization methods aim to learn transferable knowledge from source domains that can generalize well to unseen target domains. Recent studies show that neural networks frequently suffer from a simplicity-biased learning behavior which leads to over-reliance on specific frequency sets, namely as frequency shortcuts, instead of semantic information, resulting in poor generalization perform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16146v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16146v1-abstract-full" style="display: none;"> Domain generalization methods aim to learn transferable knowledge from source domains that can generalize well to unseen target domains. Recent studies show that neural networks frequently suffer from a simplicity-biased learning behavior which leads to over-reliance on specific frequency sets, namely as frequency shortcuts, instead of semantic information, resulting in poor generalization performance. Despite previous data augmentation techniques successfully enhancing generalization performances, they intend to apply more frequency shortcuts, thereby causing hallucinations of generalization improvement. In this paper, we aim to prevent such learning behavior of applying frequency shortcuts from a data-driven perspective. Given the theoretical justification of models' biased learning behavior on different spatial frequency components, which is based on the dataset frequency properties, we argue that the learning behavior on various frequency components could be manipulated by changing the dataset statistical structure in the Fourier domain. Intuitively, as frequency shortcuts are hidden in the dominant and highly dependent frequencies of dataset structure, dynamically perturbating the over-reliance frequency components could prevent the application of frequency shortcuts. To this end, we propose two effective data augmentation modules designed to collaboratively and adaptively adjust the frequency characteristic of the dataset, aiming to dynamically influence the learning behavior of the model and ultimately serving as a strategy to mitigate shortcut learning. Code is available at AdvFrequency (https://github.com/C0notSilly/AdvFrequency). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16146v1-abstract-full').style.display = 'none'; document.getElementById('2410.16146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16090">arXiv:2410.16090</a> <span> [<a href="https://arxiv.org/pdf/2410.16090">pdf</a>, <a href="https://arxiv.org/format/2410.16090">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Analysing the Residual Stream of Language Models Under Knowledge Conflicts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&query=Devoto%2C+A">Alessio Devoto</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K">Kam-Fai Wong</a>, <a href="/search/cs?searchtype=author&query=Minervini%2C+P">Pasquale Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16090v1-abstract-short" style="display: inline;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context. Such conflicts can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. In this work, we investigate whether LLMs can identify knowledge conflicts and whether it is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16090v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16090v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16090v1-abstract-full" style="display: none;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context. Such conflicts can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. In this work, we investigate whether LLMs can identify knowledge conflicts and whether it is possible to know which source of knowledge the model will rely on by analysing the residual stream of the LLM. Through probing tasks, we find that LLMs can internally register the signal of knowledge conflict in the residual stream, which can be accurately detected by probing the intermediate model activations. This allows us to detect conflicts within the residual stream before generating the answers without modifying the input or model parameters. Moreover, we find that the residual stream shows significantly different patterns when the model relies on contextual knowledge versus parametric knowledge to resolve conflicts. This pattern can be employed to estimate the behaviour of LLMs when conflict happens and prevent unexpected answers before producing the answers. Our analysis offers insights into how LLMs internally manage knowledge conflicts and provides a foundation for developing methods to control the knowledge selection processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16090v1-abstract-full').style.display = 'none'; document.getElementById('2410.16090v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Foundation Model Interventions Workshop @ NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15999">arXiv:2410.15999</a> <span> [<a href="https://arxiv.org/pdf/2410.15999">pdf</a>, <a href="https://arxiv.org/format/2410.15999">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Steering Knowledge Selection Behaviours in LLMs via SAE-Based Representation Engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Devoto%2C+A">Alessio Devoto</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K">Kam-Fai Wong</a>, <a href="/search/cs?searchtype=author&query=Minervini%2C+P">Pasquale Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15999v2-abstract-short" style="display: inline;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context -- this phenomenon, known as \emph{context-memory knowledge conflicts}, can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. Analysing the internal activations o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15999v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15999v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15999v2-abstract-full" style="display: none;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context -- this phenomenon, known as \emph{context-memory knowledge conflicts}, can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. Analysing the internal activations of LLMs, we find that they can internally register the signals of knowledge conflict at mid-layers. Such signals allow us to detect whether a knowledge conflict occurs and use \emph{inference-time} intervention strategies to resolve it. In this work, we propose \textsc{SpARE}, a \emph{training-free} representation engineering method that uses pre-trained sparse auto-encoders (SAEs) to control the knowledge selection behaviour of LLMs. \textsc{SpARE} identifies the functional features that control the knowledge selection behaviours and applies them to edit the internal activations of LLMs at inference time. Our experimental results show that \textsc{SpARE} can effectively control the usage of either knowledge source to resolve knowledge conflict in open-domain question-answering tasks, surpassing existing representation engineering methods ($+10\%$) as well as contrastive decoding methods ($+15\%$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15999v2-abstract-full').style.display = 'none'; document.getElementById('2410.15999v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15732">arXiv:2410.15732</a> <span> [<a href="https://arxiv.org/pdf/2410.15732">pdf</a>, <a href="https://arxiv.org/format/2410.15732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ViMoE: An Empirical Study of Designing Vision Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xumeng Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+L">Longhui Wei</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhiyang Dou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+C">Chenhui Qiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xin He</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yingfei Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qi Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15732v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) models embody the divide-and-conquer concept and are a promising approach for increasing model capacity, demonstrating excellent scalability across multiple domains. In this paper, we integrate the MoE structure into the classic Vision Transformer (ViT), naming it ViMoE, and explore the potential of applying MoE to vision through a comprehensive study on image classificati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15732v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15732v1-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) models embody the divide-and-conquer concept and are a promising approach for increasing model capacity, demonstrating excellent scalability across multiple domains. In this paper, we integrate the MoE structure into the classic Vision Transformer (ViT), naming it ViMoE, and explore the potential of applying MoE to vision through a comprehensive study on image classification. However, we observe that the performance is sensitive to the configuration of MoE layers, making it challenging to obtain optimal results without careful design. The underlying cause is that inappropriate MoE layers lead to unreliable routing and hinder experts from effectively acquiring helpful knowledge. To address this, we introduce a shared expert to learn and capture common information, serving as an effective way to construct stable ViMoE. Furthermore, we demonstrate how to analyze expert routing behavior, revealing which MoE layers are capable of specializing in handling specific information and which are not. This provides guidance for retaining the critical layers while removing redundancies, thereby advancing ViMoE to be more efficient without sacrificing accuracy. We aspire for this work to offer new insights into the design of vision MoE models and provide valuable empirical guidance for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15732v1-abstract-full').style.display = 'none'; document.getElementById('2410.15732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14170">arXiv:2410.14170</a> <span> [<a href="https://arxiv.org/pdf/2410.14170">pdf</a>, <a href="https://arxiv.org/format/2410.14170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Personalized Image Generation with Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiyan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Biao%2C+T">Tang Biao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Peng Yan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14170v1-abstract-short" style="display: inline;"> Personalized content filtering, such as recommender systems, has become a critical infrastructure to alleviate information overload. However, these systems merely filter existing content and are constrained by its limited diversity, making it difficult to meet users' varied content needs. To address this limitation, personalized content generation has emerged as a promising direction with broad ap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14170v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14170v1-abstract-full" style="display: none;"> Personalized content filtering, such as recommender systems, has become a critical infrastructure to alleviate information overload. However, these systems merely filter existing content and are constrained by its limited diversity, making it difficult to meet users' varied content needs. To address this limitation, personalized content generation has emerged as a promising direction with broad applications. Nevertheless, most existing research focuses on personalized text generation, with relatively little attention given to personalized image generation. The limited work in personalized image generation faces challenges in accurately capturing users' visual preferences and needs from noisy user-interacted images and complex multimodal instructions. Worse still, there is a lack of supervised data for training personalized image generation models. To overcome the challenges, we propose a Personalized Image Generation Framework named Pigeon, which adopts exceptional large multimodal models with three dedicated modules to capture users' visual preferences and needs from noisy user history and multimodal instructions. To alleviate the data scarcity, we introduce a two-stage preference alignment scheme, comprising masked preference reconstruction and pairwise preference alignment, to align Pigeon with the personalized image generation task. We apply Pigeon to personalized sticker and movie poster generation, where extensive quantitative results and human evaluation highlight its superiority over various generative baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14170v1-abstract-full').style.display = 'none'; document.getElementById('2410.14170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13285">arXiv:2410.13285</a> <span> [<a href="https://arxiv.org/pdf/2410.13285">pdf</a>, <a href="https://arxiv.org/format/2410.13285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Composing Novel Classes: A Concept-Driven Approach to Generalized Category Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+P">Peiyan Gu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xueyang Yu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuming He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13285v1-abstract-short" style="display: inline;"> We tackle the generalized category discovery (GCD) problem, which aims to discover novel classes in unlabeled datasets by leveraging the knowledge of known classes. Previous works utilize the known class knowledge through shared representation spaces. Despite their progress, our analysis experiments show that novel classes can achieve impressive clustering results on the feature space of a known c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13285v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13285v1-abstract-full" style="display: none;"> We tackle the generalized category discovery (GCD) problem, which aims to discover novel classes in unlabeled datasets by leveraging the knowledge of known classes. Previous works utilize the known class knowledge through shared representation spaces. Despite their progress, our analysis experiments show that novel classes can achieve impressive clustering results on the feature space of a known class pre-trained model, suggesting that existing methods may not fully utilize known class knowledge. To address it, we introduce a novel concept learning framework for GCD, named ConceptGCD, that categorizes concepts into two types: derivable and underivable from known class concepts, and adopts a stage-wise learning strategy to learn them separately. Specifically, our framework first extracts known class concepts by a known class pre-trained model and then produces derivable concepts from them by a generator layer with a covariance-augmented loss. Subsequently, we expand the generator layer to learn underivable concepts in a balanced manner ensured by a concept score normalization strategy and integrate a contrastive loss to preserve previously learned concepts. Extensive experiments on various benchmark datasets demonstrate the superiority of our approach over the previous state-of-the-art methods. Code will be available soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13285v1-abstract-full').style.display = 'none'; document.getElementById('2410.13285v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Underreview. The first two authors contribute equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12836">arXiv:2410.12836</a> <span> [<a href="https://arxiv.org/pdf/2410.12836">pdf</a>, <a href="https://arxiv.org/format/2410.12836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kaizhi Zheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaotong Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuehai He</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jing Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+E">Xin Eric Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12836v1-abstract-short" style="display: inline;"> Given the steep learning curve of professional 3D software and the time-consuming process of managing large 3D assets, language-guided 3D scene editing has significant potential in fields such as virtual reality, augmented reality, and gaming. However, recent approaches to language-guided 3D scene editing either require manual interventions or focus only on appearance modifications without support… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12836v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12836v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12836v1-abstract-full" style="display: none;"> Given the steep learning curve of professional 3D software and the time-consuming process of managing large 3D assets, language-guided 3D scene editing has significant potential in fields such as virtual reality, augmented reality, and gaming. However, recent approaches to language-guided 3D scene editing either require manual interventions or focus only on appearance modifications without supporting comprehensive scene layout changes. In response, we propose Edit-Room, a unified framework capable of executing a variety of layout edits through natural language commands, without requiring manual intervention. Specifically, EditRoom leverages Large Language Models (LLMs) for command planning and generates target scenes using a diffusion-based method, enabling six types of edits: rotate, translate, scale, replace, add, and remove. To address the lack of data for language-guided 3D scene editing, we have developed an automatic pipeline to augment existing 3D scene synthesis datasets and introduced EditRoom-DB, a large-scale dataset with 83k editing pairs, for training and evaluation. Our experiments demonstrate that our approach consistently outperforms other baselines across all metrics, indicating higher accuracy and coherence in language-guided scene layout editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12836v1-abstract-full').style.display = 'none'; document.getElementById('2410.12836v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12707">arXiv:2410.12707</a> <span> [<a href="https://arxiv.org/pdf/2410.12707">pdf</a>, <a href="https://arxiv.org/format/2410.12707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FusionLLM: A Decentralized LLM Training System on Geo-distributed GPUs with Adaptive Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+X">Xueze Kang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yiming Yin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xinglin Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xin He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiang Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+R">Rongfei Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kaiyong Zhao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shaohuai Shi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A+C">Amelie Chi Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bingsheng He</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12707v1-abstract-short" style="display: inline;"> To alleviate hardware scarcity in training large deep neural networks (DNNs), particularly large language models (LLMs), we present FusionLLM, a decentralized training system designed and implemented for training DNNs using geo-distributed GPUs across different computing clusters or individual devices. Decentralized training faces significant challenges regarding system design and efficiency, incl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12707v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12707v1-abstract-full" style="display: none;"> To alleviate hardware scarcity in training large deep neural networks (DNNs), particularly large language models (LLMs), we present FusionLLM, a decentralized training system designed and implemented for training DNNs using geo-distributed GPUs across different computing clusters or individual devices. Decentralized training faces significant challenges regarding system design and efficiency, including: 1) the need for remote automatic differentiation (RAD), 2) support for flexible model definitions and heterogeneous software, 3) heterogeneous hardware leading to low resource utilization or the straggler problem, and 4) slow network communication. To address these challenges, in the system design, we represent the model as a directed acyclic graph of operators (OP-DAG). Each node in the DAG represents the operator in the DNNs, while the edge represents the data dependency between operators. Based on this design, 1) users are allowed to customize any DNN without caring low-level operator implementation; 2) we enable the task scheduling with the more fine-grained sub-tasks, offering more optimization space; 3) a DAG runtime executor can implement RAD withour requiring the consistent low-level ML framework versions. To enhance system efficiency, we implement a workload estimator and design an OP-Fence scheduler to cluster devices with similar bandwidths together and partition the DAG to increase throughput. Additionally, we propose an AdaTopK compressor to adaptively compress intermediate activations and gradients at the slowest communication links. To evaluate the convergence and efficiency of our system and algorithms, we train ResNet-101 and GPT-2 on three real-world testbeds using 48 GPUs connected with 8 Mbps~10 Gbps networks. Experimental results demonstrate that our system and method can achieve 1.45 - 9.39x speedup compared to baseline methods while ensuring convergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12707v1-abstract-full').style.display = 'none'; document.getElementById('2410.12707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12519">arXiv:2410.12519</a> <span> [<a href="https://arxiv.org/pdf/2410.12519">pdf</a>, <a href="https://arxiv.org/format/2410.12519">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RosePO: Aligning LLM-based Recommenders with Human Values </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jiayi Liao</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiancan Wu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yancheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Z">Zhanhui Kang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12519v1-abstract-short" style="display: inline;"> Recently, there has been a growing interest in leveraging Large Language Models (LLMs) for recommendation systems, which usually adapt a pre-trained LLM to the recommendation scenario through supervised fine-tuning (SFT). However, both the pre-training and SFT stages fail to explicitly model the comparative relationships of a user's preferences on different items. To construct a "helpful and harml… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12519v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12519v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12519v1-abstract-full" style="display: none;"> Recently, there has been a growing interest in leveraging Large Language Models (LLMs) for recommendation systems, which usually adapt a pre-trained LLM to the recommendation scenario through supervised fine-tuning (SFT). However, both the pre-training and SFT stages fail to explicitly model the comparative relationships of a user's preferences on different items. To construct a "helpful and harmless" LLM-based recommender, we propose a general framework -- Recommendation with smoothing personalized Preference Optimization (RosePO), which better aligns with customized human values during the post-training stage. Specifically, in addition to the input and chosen response that naturally align with SFT data, we design a rejected sampling strategy tailored for enhancing helpfulness, along with two strategies aimed at mitigating biases to promote harmlessness. To ensure robustness against uncertain labels present in automatically constructed preference data, we introduce a personalized smoothing factor predicted by a preference oracle into the optimization objective. Evaluation on three real-world datasets demonstrates the effectiveness of our method, showcasing not only improved recommendation performance but also mitigation of semantic hallucination and popularity bias. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12519v1-abstract-full').style.display = 'none'; document.getElementById('2410.12519v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12229">arXiv:2410.12229</a> <span> [<a href="https://arxiv.org/pdf/2410.12229">pdf</a>, <a href="https://arxiv.org/format/2410.12229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Comprehending Knowledge Graphs with Large Language Models for Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Ziqiang Cui</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+Y">Yunpeng Weng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xing Tang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+F">Fuyuan Lyu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dugang Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiuqiang He</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12229v1-abstract-short" style="display: inline;"> Recently, the introduction of knowledge graphs (KGs) has significantly advanced recommender systems by facilitating the discovery of potential associations between items. However, existing methods still face several limitations. First, most KGs suffer from missing facts or limited scopes. This can lead to biased knowledge representations, thereby constraining the model's performance. Second, exist… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12229v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12229v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12229v1-abstract-full" style="display: none;"> Recently, the introduction of knowledge graphs (KGs) has significantly advanced recommender systems by facilitating the discovery of potential associations between items. However, existing methods still face several limitations. First, most KGs suffer from missing facts or limited scopes. This can lead to biased knowledge representations, thereby constraining the model's performance. Second, existing methods typically convert textual information into IDs, resulting in the loss of natural semantic connections between different items. Third, existing methods struggle to capture high-order relationships in global KGs due to their inefficient layer-by-layer information propagation mechanisms, which are prone to introducing significant noise. To address these limitations, we propose a novel method called CoLaKG, which leverages large language models (LLMs) for knowledge-aware recommendation. The extensive world knowledge and remarkable reasoning capabilities of LLMs enable them to supplement KGs. Additionally, the strong text comprehension abilities of LLMs allow for a better understanding of semantic information. Based on this, we first extract subgraphs centered on each item from the KG and convert them into textual inputs for the LLM. The LLM then outputs its comprehension of these item-centered subgraphs, which are subsequently transformed into semantic embeddings. Furthermore, to utilize the global information of the KG, we construct an item-item graph using these semantic embeddings, which can directly capture higher-order associations between items. Both the semantic embeddings and the structural information from the item-item graph are effectively integrated into the recommendation model through our designed representation alignment and neighbor augmentation modules. Extensive experiments on four real-world datasets demonstrate the superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12229v1-abstract-full').style.display = 'none'; document.getElementById('2410.12229v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11825">arXiv:2410.11825</a> <span> [<a href="https://arxiv.org/pdf/2410.11825">pdf</a>, <a href="https://arxiv.org/format/2410.11825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning Smooth Humanoid Locomotion through Lipschitz-Constrained Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zixuan Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xialin He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yen-Jen Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Q">Qiayuan Liao</a>, <a href="/search/cs?searchtype=author&query=Ze%2C+Y">Yanjie Ze</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhongyu Li</a>, <a href="/search/cs?searchtype=author&query=Sastry%2C+S+S">S. Shankar Sastry</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajun Wu</a>, <a href="/search/cs?searchtype=author&query=Sreenath%2C+K">Koushil Sreenath</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+S">Saurabh Gupta</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X+B">Xue Bin Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11825v3-abstract-short" style="display: inline;"> Reinforcement learning combined with sim-to-real transfer offers a general framework for developing locomotion controllers for legged robots. To facilitate successful deployment in the real world, smoothing techniques, such as low-pass filters and smoothness rewards, are often employed to develop policies with smooth behaviors. However, because these techniques are non-differentiable and usually r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11825v3-abstract-full').style.display = 'inline'; document.getElementById('2410.11825v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11825v3-abstract-full" style="display: none;"> Reinforcement learning combined with sim-to-real transfer offers a general framework for developing locomotion controllers for legged robots. To facilitate successful deployment in the real world, smoothing techniques, such as low-pass filters and smoothness rewards, are often employed to develop policies with smooth behaviors. However, because these techniques are non-differentiable and usually require tedious tuning of a large set of hyperparameters, they tend to require extensive manual tuning for each robotic platform. To address this challenge and establish a general technique for enforcing smooth behaviors, we propose a simple and effective method that imposes a Lipschitz constraint on a learned policy, which we refer to as Lipschitz-Constrained Policies (LCP). We show that the Lipschitz constraint can be implemented in the form of a gradient penalty, which provides a differentiable objective that can be easily incorporated with automatic differentiation frameworks. We demonstrate that LCP effectively replaces the need for smoothing rewards or low-pass filters and can be easily integrated into training frameworks for many distinct humanoid robots. We extensively evaluate LCP in both simulation and real-world humanoid robots, producing smooth and robust locomotion controllers. All simulation and deployment code, along with complete checkpoints, is available on our project page: https://lipschitz-constrained-policy.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11825v3-abstract-full').style.display = 'none'; document.getElementById('2410.11825v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11345">arXiv:2410.11345</a> <span> [<a href="https://arxiv.org/pdf/2410.11345">pdf</a>, <a href="https://arxiv.org/format/2410.11345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Visual Manipulation with Legs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xialin He</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chengjing Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+R">Ruihan Yang</a>, <a href="/search/cs?searchtype=author&query=Held%2C+D">David Held</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11345v3-abstract-short" style="display: inline;"> Animals use limbs for both locomotion and manipulation. We aim to equip quadruped robots with similar versatility. This work introduces a system that enables quadruped robots to interact with objects using their legs, inspired by non-prehensile manipulation. The system has two main components: a visual manipulation policy module and a loco-manipulator module. The visual manipulation policy, traine… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11345v3-abstract-full').style.display = 'inline'; document.getElementById('2410.11345v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11345v3-abstract-full" style="display: none;"> Animals use limbs for both locomotion and manipulation. We aim to equip quadruped robots with similar versatility. This work introduces a system that enables quadruped robots to interact with objects using their legs, inspired by non-prehensile manipulation. The system has two main components: a visual manipulation policy module and a loco-manipulator module. The visual manipulation policy, trained with reinforcement learning (RL) using point cloud observations and object-centric actions, decides how the leg should interact with the object. The loco-manipulator controller manages leg movements and body pose adjustments, based on impedance control and Model Predictive Control (MPC). Besides manipulating objects with a single leg, the system can select from the left or right leg based on critic maps and move objects to distant goals through base adjustment. Experiments evaluate the system on object pose alignment tasks in both simulation and the real world, demonstrating more versatile object manipulation skills with legs than previous work. Videos can be found at https://legged-manipulation.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11345v3-abstract-full').style.display = 'none'; document.getElementById('2410.11345v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">More details can be found on our project page: https://legged-manipulation.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10815">arXiv:2410.10815</a> <span> [<a href="https://arxiv.org/pdf/2410.10815">pdf</a>, <a href="https://arxiv.org/format/2410.10815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Depth Any Video with Scalable Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Honghui Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haifeng Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10815v1-abstract-short" style="display: inline;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yieldin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10815v1-abstract-full" style="display: none;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yielding 40,000 video clips of 5-second duration, each with precise depth annotations. Second, we leverage the powerful priors of generative video diffusion models to handle real-world videos effectively, integrating advanced techniques such as rotary position encoding and flow matching to further enhance flexibility and efficiency. Unlike previous models, which are limited to fixed-length video sequences, our approach introduces a novel mixed-duration training strategy that handles videos of varying lengths and performs robustly across different frame rates-even on single frames. At inference, we propose a depth interpolation method that enables our model to infer high-resolution video depth across sequences of up to 150 frames. Our model outperforms all previous generative depth models in terms of spatial accuracy and temporal consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10815v1-abstract-full').style.display = 'none'; document.getElementById('2410.10815v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://depthanyvideo.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10803">arXiv:2410.10803</a> <span> [<a href="https://arxiv.org/pdf/2410.10803">pdf</a>, <a href="https://arxiv.org/format/2410.10803">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Humanoid Manipulation with Improved 3D Diffusion Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ze%2C+Y">Yanjie Ze</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenhao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianyi Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xialin He</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ying Yuan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X+B">Xue Bin Peng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajun Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10803v1-abstract-short" style="display: inline;"> Humanoid robots capable of autonomous operation in diverse environments have long been a goal for roboticists. However, autonomous manipulation by humanoid robots has largely been restricted to one specific scene, primarily due to the difficulty of acquiring generalizable skills. Recent advances in 3D visuomotor policies, such as the 3D Diffusion Policy (DP3), have shown promise in extending these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10803v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10803v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10803v1-abstract-full" style="display: none;"> Humanoid robots capable of autonomous operation in diverse environments have long been a goal for roboticists. However, autonomous manipulation by humanoid robots has largely been restricted to one specific scene, primarily due to the difficulty of acquiring generalizable skills. Recent advances in 3D visuomotor policies, such as the 3D Diffusion Policy (DP3), have shown promise in extending these capabilities to wilder environments. However, 3D visuomotor policies often rely on camera calibration and point-cloud segmentation, which present challenges for deployment on mobile robots like humanoids. In this work, we introduce the Improved 3D Diffusion Policy (iDP3), a novel 3D visuomotor policy that eliminates these constraints by leveraging egocentric 3D visual representations. We demonstrate that iDP3 enables a full-sized humanoid robot to autonomously perform skills in diverse real-world scenarios, using only data collected in the lab. Videos are available at: https://humanoid-manipulation.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10803v1-abstract-full').style.display = 'none'; document.getElementById('2410.10803v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://humanoid-manipulation.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10563">arXiv:2410.10563</a> <span> [<a href="https://arxiv.org/pdf/2410.10563">pdf</a>, <a href="https://arxiv.org/format/2410.10563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiacheng Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+T">Tianhao Liang</a>, <a href="/search/cs?searchtype=author&query=Siu%2C+S">Sherman Siu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengqing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yubo Wang</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+Y">Yuansheng Ni</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wang Zhu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Ziyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+B">Bohan Lyu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Dongfu Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuan He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xiang Yue</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenhu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10563v2-abstract-short" style="display: inline;"> We present MEGA-Bench, an evaluation suite that scales multimodal evaluation to over 500 real-world tasks, to address the highly heterogeneous daily use cases of end users. Our objective is to optimize for a set of high-quality data samples that cover a highly diverse and rich set of multimodal tasks, while enabling cost-effective and accurate model evaluation. In particular, we collected 505 real… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10563v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10563v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10563v2-abstract-full" style="display: none;"> We present MEGA-Bench, an evaluation suite that scales multimodal evaluation to over 500 real-world tasks, to address the highly heterogeneous daily use cases of end users. Our objective is to optimize for a set of high-quality data samples that cover a highly diverse and rich set of multimodal tasks, while enabling cost-effective and accurate model evaluation. In particular, we collected 505 realistic tasks encompassing over 8,000 samples from 16 expert annotators to extensively cover the multimodal task space. Instead of unifying these problems into standard multi-choice questions (like MMMU, MMBench, and MMT-Bench), we embrace a wide range of output formats like numbers, phrases, code, \LaTeX, coordinates, JSON, free-form, etc. To accommodate these formats, we developed over 40 metrics to evaluate these tasks. Unlike existing benchmarks, MEGA-Bench offers a fine-grained capability report across multiple dimensions (e.g., application, input type, output format, skill), allowing users to interact with and visualize model capabilities in depth. We evaluate a wide variety of frontier vision-language models on MEGA-Bench to understand their capabilities across these dimensions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10563v2-abstract-full').style.display = 'none'; document.getElementById('2410.10563v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report. Project page: https://tiger-ai-lab.github.io/MEGA-Bench/. v2 includes more evaluated models and a single-image setting</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10148">arXiv:2410.10148</a> <span> [<a href="https://arxiv.org/pdf/2410.10148">pdf</a>, <a href="https://arxiv.org/format/2410.10148">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> $伪$-DPO: Adaptive Reward Margin is What Direct Preference Optimization Needs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junkang Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xue Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyi Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiancan Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jinyang Gao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+B">Bolin Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10148v3-abstract-short" style="display: inline;"> Aligning large language models (LLMs) with human values and intentions is crucial for their utility, honesty, and safety. Reinforcement learning from human feedback (RLHF) is a popular approach to achieve this alignment, but it faces challenges in computational efficiency and training stability. Recent methods like Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO) hav… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10148v3-abstract-full').style.display = 'inline'; document.getElementById('2410.10148v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10148v3-abstract-full" style="display: none;"> Aligning large language models (LLMs) with human values and intentions is crucial for their utility, honesty, and safety. Reinforcement learning from human feedback (RLHF) is a popular approach to achieve this alignment, but it faces challenges in computational efficiency and training stability. Recent methods like Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO) have proposed offline alternatives to RLHF, simplifying the process by reparameterizing the reward function. However, DPO depends on a potentially suboptimal reference model, and SimPO's assumption of a fixed target reward margin may lead to suboptimal decisions in diverse data settings. In this work, we propose $伪$-DPO, an adaptive preference optimization algorithm designed to address these limitations by introducing a dynamic reward margin. Specifically, $伪$-DPO employs an adaptive preference distribution, balancing the policy model and the reference model to achieve personalized reward margins. We provide theoretical guarantees for $伪$-DPO, demonstrating its effectiveness as a surrogate optimization objective and its ability to balance alignment and diversity through KL divergence control. Empirical evaluations on AlpacaEval 2 and Arena-Hard show that $伪$-DPO consistently outperforms DPO and SimPO across various model settings, establishing it as a robust approach for fine-tuning LLMs. Our method achieves significant improvements in win rates, highlighting its potential as a powerful tool for LLM alignment. The code is available at https://github.com/junkangwu/alpha-DPO <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10148v3-abstract-full').style.display = 'none'; document.getElementById('2410.10148v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09865">arXiv:2410.09865</a> <span> [<a href="https://arxiv.org/pdf/2410.09865">pdf</a>, <a href="https://arxiv.org/format/2410.09865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SynFER: Towards Boosting Facial Expression Recognition with Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xilin He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Cheng Luo</a>, <a href="/search/cs?searchtype=author&query=Xian%2C+X">Xiaole Xian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Siyang Song</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+M+H">Muhammad Haris Khan</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+W">Weicheng Xie</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Linlin Shen</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zongyuan Ge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09865v2-abstract-short" style="display: inline;"> Facial expression datasets remain limited in scale due to privacy concerns, the subjectivity of annotations, and the labor-intensive nature of data collection. This limitation poses a significant challenge for developing modern deep learning-based facial expression analysis models, particularly foundation models, that rely on large-scale data for optimal performance. To tackle the overarching and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09865v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09865v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09865v2-abstract-full" style="display: none;"> Facial expression datasets remain limited in scale due to privacy concerns, the subjectivity of annotations, and the labor-intensive nature of data collection. This limitation poses a significant challenge for developing modern deep learning-based facial expression analysis models, particularly foundation models, that rely on large-scale data for optimal performance. To tackle the overarching and complex challenge, we introduce SynFER (Synthesis of Facial Expressions with Refined Control), a novel framework for synthesizing facial expression image data based on high-level textual descriptions as well as more fine-grained and precise control through facial action units. To ensure the quality and reliability of the synthetic data, we propose a semantic guidance technique to steer the generation process and a pseudo-label generator to help rectify the facial expression labels for the synthetic images. To demonstrate the generation fidelity and the effectiveness of the synthetic data from SynFER, we conduct extensive experiments on representation learning using both synthetic data and real-world data. Experiment results validate the efficacy of the proposed approach and the synthetic data. Notably, our approach achieves a 67.23% classification accuracy on AffectNet when training solely with synthetic data equivalent to the AffectNet training set size, which increases to 69.84% when scaling up to five times the original size. Our code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09865v2-abstract-full').style.display = 'none'; document.getElementById('2410.09865v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated Results</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09854">arXiv:2410.09854</a> <span> [<a href="https://arxiv.org/pdf/2410.09854">pdf</a>, <a href="https://arxiv.org/format/2410.09854">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A Model Is Not Built By A Single Prompt: LLM-Based Domain Modeling With Question Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ru Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jingwei Shen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09854v1-abstract-short" style="display: inline;"> Domain modeling, a crucial part of model-driven engineering, demands extensive domain knowledge and experience from engineers. When the system description is highly complicated, the modeling task can become particularly challenging and time-consuming. Large language Models(LLMs) can assist by automatically generating an initial object model from the system description. Although LLMs have demonstra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09854v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09854v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09854v1-abstract-full" style="display: none;"> Domain modeling, a crucial part of model-driven engineering, demands extensive domain knowledge and experience from engineers. When the system description is highly complicated, the modeling task can become particularly challenging and time-consuming. Large language Models(LLMs) can assist by automatically generating an initial object model from the system description. Although LLMs have demonstrated remarkable code-generation ability, they still struggle with model-generation using a single prompt. In real-world domain modeling, engineers usually decompose complex tasks into easily solvable sub-tasks, significantly controlling complexity and enhancing model quality. Inspired by this, we propose an LLM-based domain modeling approach via question decomposition, similar to developer's modeling process. Following conventional modeling guidelines, we divide the model generation task into several sub-tasks, i.e., class generation, association and aggregation generation, and inheritance generation. For each sub-task, we carefully design the prompt by choosing more efficient query words and providing essential modeling knowledge to unlock the modeling potential of LLMs. To sum up all the sub-tasks solutions, we implemente a proof-of-object tool integrated into the standard Ecore editor that asks LLMs to generate an object model from the system description. We evaluate our approach with 20 systems from different application domains. The preliminary results show that our approach outperforms the single-prompt-based prompt by improving recall values and F1 scores in most systems for modeling the classes, attributes, and relationships. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09854v1-abstract-full').style.display = 'none'; document.getElementById('2410.09854v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09382">arXiv:2410.09382</a> <span> [<a href="https://arxiv.org/pdf/2410.09382">pdf</a>, <a href="https://arxiv.org/format/2410.09382">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CLIP-SCGI: Synthesized Caption-Guided Inversion for Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Q">Qianru Han</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinwei He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sannyuya Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+J">Jinhai Xiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09382v1-abstract-short" style="display: inline;"> Person re-identification (ReID) has recently benefited from large pretrained vision-language models such as Contrastive Language-Image Pre-Training (CLIP). However, the absence of concrete descriptions necessitates the use of implicit text embeddings, which demand complicated and inefficient training strategies. To address this issue, we first propose one straightforward solution by leveraging exi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09382v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09382v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09382v1-abstract-full" style="display: none;"> Person re-identification (ReID) has recently benefited from large pretrained vision-language models such as Contrastive Language-Image Pre-Training (CLIP). However, the absence of concrete descriptions necessitates the use of implicit text embeddings, which demand complicated and inefficient training strategies. To address this issue, we first propose one straightforward solution by leveraging existing image captioning models to generate pseudo captions for person images, and thereby boost person re-identification with large vision language models. Using models like the Large Language and Vision Assistant (LLAVA), we generate high-quality captions based on fixed templates that capture key semantic attributes such as gender, clothing, and age. By augmenting ReID training sets from uni-modality (image) to bi-modality (image and text), we introduce CLIP-SCGI, a simple yet effective framework that leverages synthesized captions to guide the learning of discriminative and robust representations. Built on CLIP, CLIP-SCGI fuses image and text embeddings through two modules to enhance the training process. To address quality issues in generated captions, we introduce a caption-guided inversion module that captures semantic attributes from images by converting relevant visual information into pseudo-word tokens based on the descriptions. This approach helps the model better capture key information and focus on relevant regions. The extracted features are then utilized in a cross-modal fusion module, guiding the model to focus on regions semantically consistent with the caption, thereby facilitating the optimization of the visual encoder to extract discriminative and robust representations. Extensive experiments on four popular ReID benchmarks demonstrate that CLIP-SCGI outperforms the state-of-the-art by a significant margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09382v1-abstract-full').style.display = 'none'; document.getElementById('2410.09382v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08282">arXiv:2410.08282</a> <span> [<a href="https://arxiv.org/pdf/2410.08282">pdf</a>, <a href="https://arxiv.org/format/2410.08282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> FusionSense: Bridging Common Sense, Vision, and Touch for Robust Sparse-View Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+I">Irving Fang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+K">Kairui Shi</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xujin He</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+S">Siqi Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanwen Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hung-Jui Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wenzhen Yuan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08282v1-abstract-short" style="display: inline;"> Humans effortlessly integrate common-sense knowledge with sensory input from vision and touch to understand their surroundings. Emulating this capability, we introduce FusionSense, a novel 3D reconstruction framework that enables robots to fuse priors from foundation models with highly sparse observations from vision and tactile sensors. FusionSense addresses three key challenges: (i) How can robo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08282v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08282v1-abstract-full" style="display: none;"> Humans effortlessly integrate common-sense knowledge with sensory input from vision and touch to understand their surroundings. Emulating this capability, we introduce FusionSense, a novel 3D reconstruction framework that enables robots to fuse priors from foundation models with highly sparse observations from vision and tactile sensors. FusionSense addresses three key challenges: (i) How can robots efficiently acquire robust global shape information about the surrounding scene and objects? (ii) How can robots strategically select touch points on the object using geometric and common-sense priors? (iii) How can partial observations such as tactile signals improve the overall representation of the object? Our framework employs 3D Gaussian Splatting as a core representation and incorporates a hierarchical optimization strategy involving global structure construction, object visual hull pruning and local geometric constraints. This advancement results in fast and robust perception in environments with traditionally challenging objects that are transparent, reflective, or dark, enabling more downstream manipulation or navigation tasks. Experiments on real-world data suggest that our framework outperforms previously state-of-the-art sparse-view methods. All code and data are open-sourced on the project website. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08282v1-abstract-full').style.display = 'none'; document.getElementById('2410.08282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.5; I.4.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08207">arXiv:2410.08207</a> <span> [<a href="https://arxiv.org/pdf/2410.08207">pdf</a>, <a href="https://arxiv.org/format/2410.08207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DICE: Discrete Inversion Enabling Controllable Editing for Multinomial Diffusion and Masked Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxiao He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Dao%2C+Q">Quan Dao</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+S">Song Wen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+M">Minhao Bai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Di Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=Min%2C+M+R">Martin Renqiang Min</a>, <a href="/search/cs?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaowei Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongdong Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junzhou Huang</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+F">Faez Ahmed</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+A">Akash Srivastava</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D">Dimitris Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08207v1-abstract-short" style="display: inline;"> Discrete diffusion models have achieved success in tasks like image generation and masked language modeling but face limitations in controlled content editing. We introduce DICE (Discrete Inversion for Controllable Editing), the first approach to enable precise inversion for discrete diffusion models, including multinomial diffusion and masked generative models. By recording noise sequences and ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08207v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08207v1-abstract-full" style="display: none;"> Discrete diffusion models have achieved success in tasks like image generation and masked language modeling but face limitations in controlled content editing. We introduce DICE (Discrete Inversion for Controllable Editing), the first approach to enable precise inversion for discrete diffusion models, including multinomial diffusion and masked generative models. By recording noise sequences and masking patterns during the reverse diffusion process, DICE enables accurate reconstruction and flexible editing of discrete data without the need for predefined masks or attention manipulation. We demonstrate the effectiveness of DICE across both image and text domains, evaluating it on models such as VQ-Diffusion, Paella, and RoBERTa. Our results show that DICE preserves high data fidelity while enhancing editing capabilities, offering new opportunities for fine-grained content manipulation in discrete spaces. For project webpage, see https://hexiaoxiao-cs.github.io/DICE/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08207v1-abstract-full').style.display = 'none'; document.getElementById('2410.08207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07654">arXiv:2410.07654</a> <span> [<a href="https://arxiv.org/pdf/2410.07654">pdf</a>, <a href="https://arxiv.org/format/2410.07654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Firzen: Firing Strict Cold-Start Items with Frozen Heterogeneous and Homogeneous Graphs for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+H">Hulingxiao He</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangteng He</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yuxin Peng</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Z">Zifei Shan</a>, <a href="/search/cs?searchtype=author&query=Su%2C+X">Xin Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07654v1-abstract-short" style="display: inline;"> Recommendation models utilizing unique identities (IDs) to represent distinct users and items have dominated the recommender systems literature for over a decade. Since multi-modal content of items (e.g., texts and images) and knowledge graphs (KGs) may reflect the interaction-related users' preferences and items' characteristics, they have been utilized as useful side information to further impro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07654v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07654v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07654v1-abstract-full" style="display: none;"> Recommendation models utilizing unique identities (IDs) to represent distinct users and items have dominated the recommender systems literature for over a decade. Since multi-modal content of items (e.g., texts and images) and knowledge graphs (KGs) may reflect the interaction-related users' preferences and items' characteristics, they have been utilized as useful side information to further improve the recommendation quality. However, the success of such methods often limits to either warm-start or strict cold-start item recommendation in which some items neither appear in the training data nor have any interactions in the test stage: (1) Some fail to learn the embedding of a strict cold-start item since side information is only utilized to enhance the warm-start ID representations; (2) The others deteriorate the performance of warm-start recommendation since unrelated multi-modal content or entities in KGs may blur the final representations. In this paper, we propose a unified framework incorporating multi-modal content of items and KGs to effectively solve both strict cold-start and warm-start recommendation termed Firzen, which extracts the user-item collaborative information over frozen heterogeneous graph (collaborative knowledge graph), and exploits the item-item semantic structures and user-user behavioral association over frozen homogeneous graphs (item-item relation graph and user-user co-occurrence graph). Furthermore, we build four unified strict cold-start evaluation benchmarks based on publicly available Amazon datasets and a real-world industrial dataset from Weixin Channels via rearranging the interaction data and constructing KGs. Extensive empirical results demonstrate that our model yields significant improvements for strict cold-start recommendation and outperforms or matches the state-of-the-art performance in the warm-start scenario. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07654v1-abstract-full').style.display = 'none'; document.getElementById('2410.07654v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICDE 2024. The code is available at https://github.com/PKU-ICST-MIPL/Firzen_ICDE2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06626">arXiv:2410.06626</a> <span> [<a href="https://arxiv.org/pdf/2410.06626">pdf</a>, <a href="https://arxiv.org/format/2410.06626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Open-RGBT: Open-vocabulary RGB-T Zero-shot Semantic Segmentation in Open-world Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+M">Meng Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Luojie Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xunjie He</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yufeng Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06626v1-abstract-short" style="display: inline;"> Semantic segmentation is a critical technique for effective scene understanding. Traditional RGB-T semantic segmentation models often struggle to generalize across diverse scenarios due to their reliance on pretrained models and predefined categories. Recent advancements in Visual Language Models (VLMs) have facilitated a shift from closed-set to open-vocabulary semantic segmentation methods. Howe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06626v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06626v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06626v1-abstract-full" style="display: none;"> Semantic segmentation is a critical technique for effective scene understanding. Traditional RGB-T semantic segmentation models often struggle to generalize across diverse scenarios due to their reliance on pretrained models and predefined categories. Recent advancements in Visual Language Models (VLMs) have facilitated a shift from closed-set to open-vocabulary semantic segmentation methods. However, these models face challenges in dealing with intricate scenes, primarily due to the heterogeneity between RGB and thermal modalities. To address this gap, we present Open-RGBT, a novel open-vocabulary RGB-T semantic segmentation model. Specifically, we obtain instance-level detection proposals by incorporating visual prompts to enhance category understanding. Additionally, we employ the CLIP model to assess image-text similarity, which helps correct semantic consistency and mitigates ambiguities in category identification. Empirical evaluations demonstrate that Open-RGBT achieves superior performance in diverse and challenging real-world scenarios, even in the wild, significantly advancing the field of RGB-T semantic segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06626v1-abstract-full').style.display = 'none'; document.getElementById('2410.06626v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05801">arXiv:2410.05801</a> <span> [<a href="https://arxiv.org/pdf/2410.05801">pdf</a>, <a href="https://arxiv.org/format/2410.05801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Retrieving, Rethinking and Revising: The Chain-of-Verification Can Improve Retrieval Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+B">Bolei He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinran He</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Lingyong Yan</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhenkai Wei</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jinchang Luo</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05801v1-abstract-short" style="display: inline;"> Recent Retrieval Augmented Generation (RAG) aims to enhance Large Language Models (LLMs) by incorporating extensive knowledge retrieved from external sources. However, such approach encounters some challenges: Firstly, the original queries may not be suitable for precise retrieval, resulting in erroneous contextual knowledge; Secondly, the language model can easily generate inconsistent answer wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05801v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05801v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05801v1-abstract-full" style="display: none;"> Recent Retrieval Augmented Generation (RAG) aims to enhance Large Language Models (LLMs) by incorporating extensive knowledge retrieved from external sources. However, such approach encounters some challenges: Firstly, the original queries may not be suitable for precise retrieval, resulting in erroneous contextual knowledge; Secondly, the language model can easily generate inconsistent answer with external references due to their knowledge boundary limitation. To address these issues, we propose the chain-of-verification (CoV-RAG) to enhance the external retrieval correctness and internal generation consistency. Specifically, we integrate the verification module into the RAG, engaging in scoring, judgment, and rewriting. To correct external retrieval errors, CoV-RAG retrieves new knowledge using a revised query. To correct internal generation errors, we unify QA and verification tasks with a Chain-of-Thought (CoT) reasoning during training. Our comprehensive experiments across various LLMs demonstrate the effectiveness and adaptability compared with other strong baselines. Especially, our CoV-RAG can significantly surpass the state-of-the-art baselines using different LLM backbones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05801v1-abstract-full').style.display = 'none'; document.getElementById('2410.05801v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 Findings. 9 pages, 4 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05785">arXiv:2410.05785</a> <span> [<a href="https://arxiv.org/pdf/2410.05785">pdf</a>, <a href="https://arxiv.org/format/2410.05785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Contextual Bandits with Non-Stationary Correlated Rewards for User Association in MmWave Vehicular Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoyang He</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaoxia Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lanhua Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05785v1-abstract-short" style="display: inline;"> Millimeter wave (mmWave) communication has emerged as a propelling technology in vehicular communication. Usually, an appropriate decision on user association requires timely channel information between vehicles and base stations (BSs), which is challenging given a fast-fading mmWave vehicular channel. In this paper, relying solely on learning transmission rate, we propose a low-complexity semi-di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05785v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05785v1-abstract-full" style="display: none;"> Millimeter wave (mmWave) communication has emerged as a propelling technology in vehicular communication. Usually, an appropriate decision on user association requires timely channel information between vehicles and base stations (BSs), which is challenging given a fast-fading mmWave vehicular channel. In this paper, relying solely on learning transmission rate, we propose a low-complexity semi-distributed contextual correlated upper confidence bound (SD-CC-UCB) algorithm to establish an up-to-date user association without explicit measurement of channel state information (CSI). Under a contextual multi-arm bandits framework, SD-CC-UCB learns and predicts the transmission rate given the location and velocity of the vehicle, which can adequately capture the intricate channel condition for a prompt decision on user association. Further, SD-CC-UCB efficiently identifies the set of candidate BSs which probably support supreme transmission rate by leveraging the correlated distributions of transmission rates on different locations. To further refine the learning transmission rate over the link to candidate BSs, each vehicle deploys the Thompson Sampling algorithm by taking the interference among vehicles and handover overhead into consideration. Numerical results show that our proposed algorithm achieves the network throughput within 100%-103% of a benchmark algorithm which requires perfect instantaneous CSI, demonstrating the effectiveness of SD-CC-UCB in vehicular communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05785v1-abstract-full').style.display = 'none'; document.getElementById('2410.05785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05763">arXiv:2410.05763</a> <span> [<a href="https://arxiv.org/pdf/2410.05763">pdf</a>, <a href="https://arxiv.org/format/2410.05763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Information Discovery in e-Commerce </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+Z">Zhaochun Ren</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D">Dawei Yin</a>, <a href="/search/cs?searchtype=author&query=de+Rijke%2C+M">Maarten de Rijke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05763v2-abstract-short" style="display: inline;"> Electronic commerce, or e-commerce, is the buying and selling of goods and services, or the transmitting of funds or data online. E-commerce platforms come in many kinds, with global players such as Amazon, Airbnb, Alibaba, eBay and platforms targeting specific geographic regions. Information retrieval has a natural role to play in e-commerce, especially in connecting people to goods and services.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05763v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05763v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05763v2-abstract-full" style="display: none;"> Electronic commerce, or e-commerce, is the buying and selling of goods and services, or the transmitting of funds or data online. E-commerce platforms come in many kinds, with global players such as Amazon, Airbnb, Alibaba, eBay and platforms targeting specific geographic regions. Information retrieval has a natural role to play in e-commerce, especially in connecting people to goods and services. Information discovery in e-commerce concerns different types of search (e.g., exploratory search vs. lookup tasks), recommender systems, and natural language processing in e-commerce portals. The rise in popularity of e-commerce sites has made research on information discovery in e-commerce an increasingly active research area. This is witnessed by an increase in publications and dedicated workshops in this space. Methods for information discovery in e-commerce largely focus on improving the effectiveness of e-commerce search and recommender systems, on enriching and using knowledge graphs to support e-commerce, and on developing innovative question answering and bot-based solutions that help to connect people to goods and services. In this survey, an overview is given of the fundamental infrastructure, algorithms, and technical solutions for information discovery in e-commerce. The topics covered include user behavior and profiling, search, recommendation, and language technology in e-commerce. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05763v2-abstract-full').style.display = 'none'; document.getElementById('2410.05763v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04886">arXiv:2410.04886</a> <span> [<a href="https://arxiv.org/pdf/2410.04886">pdf</a>, <a href="https://arxiv.org/format/2410.04886">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> High Information Density and Low Coverage Data Storage in DNA with Efficient Channel Coding Schemes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yi Ding</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuan He</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T+T">Tuan Thanh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+W">Wentu Song</a>, <a href="/search/cs?searchtype=author&query=Yakhini%2C+Z">Zohar Yakhini</a>, <a href="/search/cs?searchtype=author&query=Yaakobi%2C+E">Eitan Yaakobi</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Linqiang Pan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiaohu Tang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+K">Kui Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04886v3-abstract-short" style="display: inline;"> DNA-based data storage has been attracting significant attention due to its extremely high data storage density, low power consumption, and long duration compared to conventional data storage media. Despite the recent advancements in DNA data storage technology, significant challenges remain. In particular, various types of errors can occur during the processes of DNA synthesis, storage, and seque… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04886v3-abstract-full').style.display = 'inline'; document.getElementById('2410.04886v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04886v3-abstract-full" style="display: none;"> DNA-based data storage has been attracting significant attention due to its extremely high data storage density, low power consumption, and long duration compared to conventional data storage media. Despite the recent advancements in DNA data storage technology, significant challenges remain. In particular, various types of errors can occur during the processes of DNA synthesis, storage, and sequencing, including substitution errors, insertion errors, and deletion errors. Furthermore, the entire oligo may be lost. In this work, we report a DNA-based data storage architecture that incorporates efficient channel coding schemes, including different types of error-correcting codes (ECCs) and constrained codes, for both the inner coding and outer coding for the DNA data storage channel. We also carried out large scale experiments to validate our proposed DNA-based data storage architecture. Specifically, 1.61 and 1.69 MB data were encoded into 30,000 oligos each, with information densities of 1.731 and 1.815, respectively. It has been found that the stored information can be fully recovered without any error at average coverages of 4.5 and 6.0, respectively. This experiment achieved the highest net information density and lowest coverage among existing DNA-based data storage experiments (with standard DNA), with data recovery rates and coverage approaching theoretical optima. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04886v3-abstract-full').style.display = 'none'; document.getElementById('2410.04886v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04691">arXiv:2410.04691</a> <span> [<a href="https://arxiv.org/pdf/2410.04691">pdf</a>, <a href="https://arxiv.org/format/2410.04691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Deeper Insights Without Updates: The Power of In-Context Learning Over Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Qingyu Yin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuzheng He</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+L">Luoao Deng</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+C+T">Chak Tou Leong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fan Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yanzhao Yan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xiaoyu Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04691v1-abstract-short" style="display: inline;"> Fine-tuning and in-context learning (ICL) are two prevalent methods in imbuing large language models with task-specific knowledge. It is commonly believed that fine-tuning can surpass ICL given sufficient training samples as it allows the model to adjust its internal parameters based on the data. However, this paper presents a counterintuitive finding: For tasks with implicit patterns, ICL capture… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04691v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04691v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04691v1-abstract-full" style="display: none;"> Fine-tuning and in-context learning (ICL) are two prevalent methods in imbuing large language models with task-specific knowledge. It is commonly believed that fine-tuning can surpass ICL given sufficient training samples as it allows the model to adjust its internal parameters based on the data. However, this paper presents a counterintuitive finding: For tasks with implicit patterns, ICL captures these patterns significantly better than fine-tuning. We developed several datasets featuring implicit patterns, such as sequences determining answers through parity or identifying reducible terms in calculations. We then evaluated the models' understanding of these patterns under both fine-tuning and ICL across models ranging from 0.5B to 7B parameters. The results indicate that models employing ICL can quickly grasp deep patterns and significantly improve accuracy. In contrast, fine-tuning, despite utilizing thousands of times more training samples than ICL, achieved only limited improvements. We also proposed circuit shift theory from a mechanistic interpretability's view to explain why ICL wins. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04691v1-abstract-full').style.display = 'none'; document.getElementById('2410.04691v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP'24 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04190">arXiv:2410.04190</a> <span> [<a href="https://arxiv.org/pdf/2410.04190">pdf</a>, <a href="https://arxiv.org/format/2410.04190">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Task Overload for Scalable Jailbreak Attacks on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yiting Dong</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+G">Guobin Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiang He</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04190v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) remain vulnerable to jailbreak attacks that bypass their safety mechanisms. Existing attack methods are fixed or specifically tailored for certain models and cannot flexibly adjust attack strength, which is critical for generalization when attacking models of various sizes. We introduce a novel scalable jailbreak attack that preempts the activation of an LLM's safety p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04190v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04190v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04190v1-abstract-full" style="display: none;"> Large Language Models (LLMs) remain vulnerable to jailbreak attacks that bypass their safety mechanisms. Existing attack methods are fixed or specifically tailored for certain models and cannot flexibly adjust attack strength, which is critical for generalization when attacking models of various sizes. We introduce a novel scalable jailbreak attack that preempts the activation of an LLM's safety policies by occupying its computational resources. Our method involves engaging the LLM in a resource-intensive preliminary task - a Character Map lookup and decoding process - before presenting the target instruction. By saturating the model's processing capacity, we prevent the activation of safety protocols when processing the subsequent instruction. Extensive experiments on state-of-the-art LLMs demonstrate that our method achieves a high success rate in bypassing safety measures without requiring gradient access, manual prompt engineering. We verified our approach offers a scalable attack that quantifies attack strength and adapts to different model scales at the optimal strength. We shows safety policies of LLMs might be more susceptible to resource constraints. Our findings reveal a critical vulnerability in current LLM safety designs, highlighting the need for more robust defense strategies that account for resource-intense condition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04190v1-abstract-full').style.display = 'none'; document.getElementById('2410.04190v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04107">arXiv:2410.04107</a> <span> [<a href="https://arxiv.org/pdf/2410.04107">pdf</a>, <a href="https://arxiv.org/format/2410.04107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TUBench: Benchmarking Large Vision-Language Models on Trustworthiness with Unanswerable Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xingwei He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qianru Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+A">A-Long Jin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Yiu%2C+S">Siu-Ming Yiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04107v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have achieved remarkable progress on visual perception and linguistic interpretation. Despite their impressive capabilities across various tasks, LVLMs still suffer from the issue of hallucination, which involves generating content that is incorrect or unfaithful to the visual or textual inputs. Traditional benchmarks, such as MME and POPE, evaluate hallucinati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04107v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04107v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04107v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have achieved remarkable progress on visual perception and linguistic interpretation. Despite their impressive capabilities across various tasks, LVLMs still suffer from the issue of hallucination, which involves generating content that is incorrect or unfaithful to the visual or textual inputs. Traditional benchmarks, such as MME and POPE, evaluate hallucination in LVLMs within the scope of Visual Question Answering (VQA) using answerable questions. However, some questions are unanswerable due to insufficient information in the images, and the performance of LVLMs on such unanswerable questions remains underexplored. To bridge this research gap, we propose TUBench, a benchmark specifically designed to evaluate the reliability of LVLMs using unanswerable questions. TUBench comprises an extensive collection of high-quality, unanswerable questions that are meticulously crafted using ten distinct strategies. To thoroughly evaluate LVLMs, the unanswerable questions in TUBench are based on images from four diverse domains as visual contexts: screenshots of code snippets, natural images, geometry diagrams, and screenshots of statistical tables. These unanswerable questions are tailored to test LVLMs' trustworthiness in code reasoning, commonsense reasoning, geometric reasoning, and mathematical reasoning related to tables, respectively. We conducted a comprehensive quantitative evaluation of 28 leading foundational models on TUBench, with Gemini-1.5-Pro, the top-performing model, achieving an average accuracy of 69.2%, and GPT-4o, the third-ranked model, reaching 66.7% average accuracy, in determining whether questions are answerable. TUBench is available at https://github.com/NLPCode/TUBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04107v1-abstract-full').style.display = 'none'; document.getElementById('2410.04107v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02832">arXiv:2410.02832</a> <span> [<a href="https://arxiv.org/pdf/2410.02832">pdf</a>, <a href="https://arxiv.org/format/2410.02832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FlipAttack: Jailbreak LLMs via Flipping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxin He</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+M">Miao Xiong</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jinlan Fu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shumin Deng</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02832v1-abstract-short" style="display: inline;"> This paper proposes a simple yet effective jailbreak attack named FlipAttack against black-box LLMs. First, from the autoregressive nature, we reveal that LLMs tend to understand the text from left to right and find that they struggle to comprehend the text when noise is added to the left side. Motivated by these insights, we propose to disguise the harmful prompt by constructing left-side noise m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02832v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02832v1-abstract-full" style="display: none;"> This paper proposes a simple yet effective jailbreak attack named FlipAttack against black-box LLMs. First, from the autoregressive nature, we reveal that LLMs tend to understand the text from left to right and find that they struggle to comprehend the text when noise is added to the left side. Motivated by these insights, we propose to disguise the harmful prompt by constructing left-side noise merely based on the prompt itself, then generalize this idea to 4 flipping modes. Second, we verify the strong ability of LLMs to perform the text-flipping task, and then develop 4 variants to guide LLMs to denoise, understand, and execute harmful behaviors accurately. These designs keep FlipAttack universal, stealthy, and simple, allowing it to jailbreak black-box LLMs within only 1 query. Experiments on 8 LLMs demonstrate the superiority of FlipAttack. Remarkably, it achieves $\sim$98\% attack success rate on GPT-4o, and $\sim$98\% bypass rate against 5 guardrail models on average. The codes are available at GitHub\footnote{https://github.com/yueliu1999/FlipAttack}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02832v1-abstract-full').style.display = 'none'; document.getElementById('2410.02832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages, 31 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02558">arXiv:2410.02558</a> <span> [<a href="https://arxiv.org/pdf/2410.02558">pdf</a>, <a href="https://arxiv.org/format/2410.02558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving Unsupervised Constituency Parsing via Maximizing Semantic Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junjie Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangheng He</a>, <a href="/search/cs?searchtype=author&query=Miyao%2C+Y">Yusuke Miyao</a>, <a href="/search/cs?searchtype=author&query=Bollegala%2C+D">Danushka Bollegala</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02558v1-abstract-short" style="display: inline;"> Unsupervised constituency parsers organize phrases within a sentence into a tree-shaped syntactic constituent structure that reflects the organization of sentence semantics. However, the traditional objective of maximizing sentence log-likelihood (LL) does not explicitly account for the close relationship between the constituent structure and the semantics, resulting in a weak correlation between… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02558v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02558v1-abstract-full" style="display: none;"> Unsupervised constituency parsers organize phrases within a sentence into a tree-shaped syntactic constituent structure that reflects the organization of sentence semantics. However, the traditional objective of maximizing sentence log-likelihood (LL) does not explicitly account for the close relationship between the constituent structure and the semantics, resulting in a weak correlation between LL values and parsing accuracy. In this paper, we introduce a novel objective for training unsupervised parsers: maximizing the information between constituent structures and sentence semantics (SemInfo). We introduce a bag-of-substrings model to represent the semantics and apply the probability-weighted information metric to estimate the SemInfo. Additionally, we develop a Tree Conditional Random Field (TreeCRF)-based model to apply the SemInfo maximization objective to Probabilistic Context-Free Grammar (PCFG) induction, the state-of-the-art method for unsupervised constituency parsing. Experiments demonstrate that SemInfo correlates more strongly with parsing accuracy than LL. Our algorithm significantly enhances parsing accuracy by an average of 7.85 points across five PCFG variants and in four languages, achieving new state-of-the-art results in three of the four languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02558v1-abstract-full').style.display = 'none'; document.getElementById('2410.02558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02355">arXiv:2410.02355</a> <span> [<a href="https://arxiv.org/pdf/2410.02355">pdf</a>, <a href="https://arxiv.org/format/2410.02355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AlphaEdit: Null-Space Constrained Knowledge Editing for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junfeng Fang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Houcheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunshan Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02355v2-abstract-short" style="display: inline;"> Large language models (LLMs) often exhibit hallucinations due to incorrect or outdated knowledge. Hence, model editing methods have emerged to enable targeted knowledge updates. To achieve this, a prevailing paradigm is the locating-then-editing approach, which first locates influential parameters and then edits them by introducing a perturbation. While effective, current studies have demonstrated… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02355v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02355v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02355v2-abstract-full" style="display: none;"> Large language models (LLMs) often exhibit hallucinations due to incorrect or outdated knowledge. Hence, model editing methods have emerged to enable targeted knowledge updates. To achieve this, a prevailing paradigm is the locating-then-editing approach, which first locates influential parameters and then edits them by introducing a perturbation. While effective, current studies have demonstrated that this perturbation inevitably disrupt the originally preserved knowledge within LLMs, especially in sequential editing scenarios. To address this, we introduce AlphaEdit, a novel solution that projects perturbation onto the null space of the preserved knowledge before applying it to the parameters. We theoretically prove that this projection ensures the output of post-edited LLMs remains unchanged when queried about the preserved knowledge, thereby mitigating the issue of disruption. Extensive experiments on various LLMs, including LLaMA3, GPT2-XL, and GPT-J, show that AlphaEdit boosts the performance of most locating-then-editing methods by an average of 36.4% with a single line of additional code for projection solely. Our code is available at: https://github.com/jianghoucheng/AlphaEdit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02355v2-abstract-full').style.display = 'none'; document.getElementById('2410.02355v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02298">arXiv:2410.02298</a> <span> [<a href="https://arxiv.org/pdf/2410.02298">pdf</a>, <a href="https://arxiv.org/format/2410.02298">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Jailbreak Antidote: Runtime Safety-Utility Balance via Sparse Representation Adjustment in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+G">Guobin Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yiting Dong</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiang He</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02298v2-abstract-short" style="display: inline;"> As large language models (LLMs) become integral to various applications, ensuring both their safety and utility is paramount. Jailbreak attacks, which manipulate LLMs into generating harmful content, pose significant challenges to this balance. Existing defenses, such as prompt engineering and safety fine-tuning, often introduce computational overhead, increase inference latency, and lack runtime… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02298v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02298v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02298v2-abstract-full" style="display: none;"> As large language models (LLMs) become integral to various applications, ensuring both their safety and utility is paramount. Jailbreak attacks, which manipulate LLMs into generating harmful content, pose significant challenges to this balance. Existing defenses, such as prompt engineering and safety fine-tuning, often introduce computational overhead, increase inference latency, and lack runtime flexibility. Moreover, overly restrictive safety measures can degrade model utility by causing refusals of benign queries. In this paper, we introduce Jailbreak Antidote, a method that enables real-time adjustment of LLM safety preferences by manipulating a sparse subset of the model's internal states during inference. By shifting the model's hidden representations along a safety direction with varying strengths, we achieve flexible control over the safety-utility balance without additional token overhead or inference delays. Our analysis reveals that safety-related information in LLMs is sparsely distributed; adjusting approximately 5% of the internal state is as effective as modifying the entire state. Extensive experiments on nine LLMs (ranging from 2 billion to 72 billion parameters), evaluated against ten jailbreak attack methods and compared with six defense strategies, validate the effectiveness and efficiency of our approach. By directly manipulating internal states during reasoning, Jailbreak Antidote offers a lightweight, scalable solution that enhances LLM safety while preserving utility, opening new possibilities for real-time safety mechanisms in widely-deployed AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02298v2-abstract-full').style.display = 'none'; document.getElementById('2410.02298v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository