Search | arXiv e-print repository

Showing 1–50 of 195 results for author: Hong, L
Searching in archive cs. Results sorted by announcement date, newest first; 50 results per page (page 1 of 4).

1. arXiv:2502.08277 [cs.IR, cs.SI] (https://arxiv.org/abs/2502.08277)
   Title: ChorusCVR: Chorus Supervision for Entire Space Post-Click Conversion Rate Modeling
   Authors: Wei Cheng, Yucheng Lu, Boyang Xia, Jiangxia Cao, Kuan Xu, Mingxing Wen, Wei Jiang, Jiaming Zhang, Zhaojie Liu, Liyin Hong, Kun Gai, Guorui Zhou
   Abstract: Post-click conversion rate (CVR) estimation is a vital task in many recommender systems for revenue businesses such as e-commerce and advertising. From a sample perspective, a typical positive CVR sample passes through a funnel from exposure to click to conversion. Because un-clicked samples lack post-click labels, CVR learning is commonly trained only on clicked samples, rather than on all exposed samples as in click-through rate (CTR) learning. During online inference, however, CVR and CTR are estimated on the same assumed exposure space, which creates an inconsistency between the training and inference sample spaces, i.e., sample selection bias (SSB). To alleviate SSB, prior work designs auxiliary tasks that enable CVR learning on un-clicked training samples, such as CTCVR and counterfactual CVR. Although these alleviate SSB to some extent, none of them attend to the distinction between ambiguous negative samples (un-clicked) and factual negative samples (clicked but un-converted) during modeling, which leaves the CVR model lacking robustness. To fill this gap, we propose ChorusCVR, a novel model that realizes debiased CVR learning in the entire space.
   Submitted 14 February 2025; v1 submitted 12 February 2025; originally announced February 2025.
   Comments: Work in progress.
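For context, the entire-space baseline that such methods build on is the CTCVR decomposition pCTCVR = pCTR * pCVR, which lets the CVR tower train on every exposed sample rather than on clicked samples only. Below is a minimal PyTorch sketch of that baseline; the two-tower names and sizes are illustrative, and ChorusCVR's chorus supervision itself is not reproduced here.

    import torch
    import torch.nn as nn

    class EntireSpaceCVR(nn.Module):
        """ESMM-style sketch: CTR and CVR towers trained on all exposures
        via pCTCVR = pCTR * pCVR, sidestepping the sample selection bias of
        click-only CVR training. Illustrative, not ChorusCVR itself."""

        def __init__(self, dim_in: int, dim_hidden: int = 64):
            super().__init__()
            def tower():
                return nn.Sequential(nn.Linear(dim_in, dim_hidden), nn.ReLU(),
                                     nn.Linear(dim_hidden, 1))
            self.ctr_tower = tower()  # models P(click | exposure)
            self.cvr_tower = tower()  # models P(conversion | click)

        def forward(self, x):
            p_ctr = torch.sigmoid(self.ctr_tower(x))
            p_cvr = torch.sigmoid(self.cvr_tower(x))
            return p_ctr, p_ctr * p_cvr  # pCTR, pCTCVR on the exposure space

    def entire_space_loss(model, x, click, conversion):
        # Both labels are observable for every exposure, so the CVR tower
        # never trains on the biased clicked-only subspace.
        bce = nn.BCELoss()
        p_ctr, p_ctcvr = model(x)
        return bce(p_ctr.squeeze(1), click) + bce(p_ctcvr.squeeze(1), conversion)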
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06892">arXiv:2502.06892</a> <span> [<a href="https://arxiv.org/pdf/2502.06892">pdf</a>, <a href="https://arxiv.org/format/2502.06892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Certifying Language Model Robustness with Fuzzed Randomized Smoothing: An Efficient Defense Against Backdoor Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+B">Bowei He</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+L">Lihao Yin</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+H">Hui-Ling Zhen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianping Zhang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Mingxuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06892v1-abstract-short" style="display: inline;"> The widespread deployment of pre-trained language models (PLMs) has exposed them to textual backdoor attacks, particularly those planted during the pre-training stage. These attacks pose significant risks to high-reliability applications, as they can stealthily affect multiple downstream tasks. While certifying robustness against such threats is crucial, existing defenses struggle with the high-di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06892v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06892v1-abstract-full" style="display: none;"> The widespread deployment of pre-trained language models (PLMs) has exposed them to textual backdoor attacks, particularly those planted during the pre-training stage. These attacks pose significant risks to high-reliability applications, as they can stealthily affect multiple downstream tasks. While certifying robustness against such threats is crucial, existing defenses struggle with the high-dimensional, interdependent nature of textual data and the lack of access to original poisoned pre-training data. To address these challenges, we introduce \textbf{F}uzzed \textbf{R}andomized \textbf{S}moothing (\textbf{FRS}), a novel approach for efficiently certifying language model robustness against backdoor attacks. FRS integrates software robustness certification techniques with biphased model parameter smoothing, employing Monte Carlo tree search for proactive fuzzing to identify vulnerable textual segments within the Damerau-Levenshtein space. This allows for targeted and efficient text randomization, while eliminating the need for access to poisoned training data during model smoothing. 
Our theoretical analysis demonstrates that FRS achieves a broader certified robustness radius compared to existing methods. Extensive experiments across various datasets, model configurations, and attack strategies validate FRS's superiority in terms of defense efficiency, accuracy, and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06892v1-abstract-full').style.display = 'none'; document.getElementById('2502.06892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06557">arXiv:2502.06557</a> <span> [<a href="https://arxiv.org/pdf/2502.06557">pdf</a>, <a href="https://arxiv.org/format/2502.06557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> LiveForesighter: Generating Future Information for Live-Streaming Recommendations at Kuaishou </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yucheng Lu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiangxia Cao</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+X">Xu Kuan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wei Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Shuang%2C+Y">Yang Shuang</a>, <a href="/search/cs?searchtype=author&query=Zhaojie%2C+L">Liu Zhaojie</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liyin Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06557v1-abstract-short" style="display: inline;"> Live-streaming, as a new-generation media to connect users and authors, has attracted a lot of attention and experienced rapid growth in recent years. Compared with the content-static short-video recommendation, the live-streaming recommendation faces more challenges in giving our users a satisfactory experience: (1) Live-streaming content is dynamically ever-changing along time. (2) valuable beha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06557v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06557v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06557v1-abstract-full" style="display: none;"> Live-streaming, as a new-generation media to connect users and authors, has attracted a lot of attention and experienced rapid growth in recent years. 
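For readers unfamiliar with the base technique, randomized smoothing certifies a classifier by reasoning about the majority vote of its predictions over randomly perturbed copies of an input. The sketch below shows only that generic voting skeleton, with an assumed token-substitution noise model; FRS's Monte Carlo tree-search fuzzing and biphased parameter smoothing are not reproduced.

    import random
    from collections import Counter

    def smoothed_predict(classify, tokens, n_samples=100, p_sub=0.1,
                         sub_vocab=("the", "a", "of", "and"), seed=0):
        """Majority vote over randomly perturbed copies of the input; this
        smoothed classifier is what certification arguments reason about.
        `classify` maps a token list to a label; `sub_vocab` is a placeholder
        substitution set."""
        rng = random.Random(seed)
        votes = Counter()
        for _ in range(n_samples):
            noisy = [rng.choice(sub_vocab) if rng.random() < p_sub else t
                     for t in tokens]
            votes[classify(noisy)] += 1
        label, count = votes.most_common(1)[0]
        return label, count / n_samples  # top class and its empirical vote share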
3. arXiv:2502.06557 [cs.IR] (https://arxiv.org/abs/2502.06557)
   Title: LiveForesighter: Generating Future Information for Live-Streaming Recommendations at Kuaishou
   Authors: Yucheng Lu, Jiangxia Cao, Xu Kuan, Wei Cheng, Wei Jiang, Jiaming Zhang, Yang Shuang, Liu Zhaojie, Liyin Hong
   Abstract: Live-streaming, a new-generation medium connecting users and authors, has attracted wide attention and grown rapidly in recent years. Compared with content-static short-video recommendation, live-streaming recommendation faces greater challenges in giving users a satisfactory experience: (1) live-streaming content changes dynamically over time, and (2) valuable behaviors (e.g., sending digital gifts, buying products) usually require users to watch for a long time (>10 min). Together, these two attributes raise a challenging question for live-streaming recommendation: how can we discover the live-streamings whose content a user is interested in at the current moment, and further over a period in the future?
   Submitted 10 February 2025; originally announced February 2025.
   Comments: Work in progress.
4. arXiv:2502.05772 [cs.CV, cs.AI] (https://arxiv.org/abs/2502.05772)
   Title: Effective Black-Box Multi-Faceted Attacks Breach Vision Large Language Model Guardrails
   Authors: Yijun Yang, Lichao Wang, Xiao Yang, Lanqing Hong, Jun Zhu
   Abstract: Vision Large Language Models (VLLMs) integrate visual data processing, expanding their real-world applications but also increasing the risk of generating unsafe responses. In response, leading companies have implemented multi-layered safety defenses, including alignment training, safety system prompts, and content moderation. However, their effectiveness against sophisticated adversarial attacks remains largely unexplored. In this paper, we propose MultiFaceted Attack, a novel attack framework designed to systematically bypass multi-layered defenses in VLLMs. It comprises three complementary facets: a Visual Attack that exploits the multimodal nature of VLLMs to inject toxic system prompts through images; an Alignment-Breaking Attack that manipulates the model's alignment mechanism to prioritize generating contrasting responses; and an Adversarial Signature that deceives content moderators by strategically placing misleading information at the end of the response. Extensive evaluations on eight commercial VLLMs in a black-box setting demonstrate that MultiFaceted Attack achieves a 61.56% attack success rate, surpassing state-of-the-art methods by at least 42.18%.
   Submitted 8 February 2025; originally announced February 2025.
5. arXiv:2501.16235 [cs.CL] (https://arxiv.org/abs/2501.16235)
   Title: Echoes of Discord: Forecasting Hater Reactions to Counterspeech
   Authors: Xiaoying Song, Sharon Lisseth Perez, Xinchen Yu, Eduardo Blanco, Lingzi Hong
   Abstract: Hate speech (HS) erodes the inclusiveness of online communities and propagates negativity and division. Counterspeech has been recognized as a way to mitigate these harmful consequences. While some research has investigated the impact of user-generated counterspeech on social media platforms, few studies have examined and modeled haters' reactions toward counterspeech, even though the immediate alteration of haters' attitudes is an important aspect of counterspeech. This study fills the gap by analyzing the impact of counterspeech from the hater's perspective, focusing on whether the counterspeech leads the hater to re-enter the conversation and whether the re-entry is hateful. We compile the Reddit Echoes of Hate dataset (ReEco), consisting of triple-turn conversations featuring haters' reactions, to assess the impact of counterspeech. To predict haters' behaviors, we employ two strategies: a two-stage reaction predictor and a three-way classifier. Linguistic analysis sheds light on how the language of counterspeech elicits different reactions from haters. Experimental results demonstrate that the three-way classification model outperforms the two-stage reaction predictor, which first predicts re-entry and then determines the re-entry type. We conclude the study with an assessment of the most common errors made by the best-performing model.
   Submitted 13 February 2025; v1 submitted 27 January 2025; originally announced January 2025.
   Comments: NAACL 2025 Findings.
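The two strategies compared in this abstract differ only in how the three-way label space is factored; a schematic sketch, where the classifier callables stand in for trained models:

    # Two ways to predict a hater's reaction to counterspeech, as compared in
    # the paper. `reentry_clf`, `type_clf`, and `three_way_clf` are placeholder
    # callables for trained classifiers.
    LABELS = ("no_reentry", "nonhateful_reentry", "hateful_reentry")

    def two_stage_predict(reentry_clf, type_clf, conversation: str) -> str:
        """Stage 1 predicts whether the hater re-enters; stage 2 classifies
        the re-entry as hateful or not. Stage-1 errors propagate to stage 2."""
        if not reentry_clf(conversation):
            return "no_reentry"
        return "hateful_reentry" if type_clf(conversation) else "nonhateful_reentry"

    def three_way_predict(three_way_clf, conversation: str) -> str:
        """Direct 3-way classification over LABELS, which the paper reports
        outperforms the two-stage pipeline."""
        return LABELS[three_way_clf(conversation)]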
6. arXiv:2501.15363 [cs.CR, cs.CV] (https://arxiv.org/abs/2501.15363)
   Title: AI-Driven Secure Data Sharing: A Trustworthy and Privacy-Preserving Approach
   Authors: Al Amin, Kamrul Hasan, Sharif Ullah, Liang Hong
   Abstract: In the era of data-driven decision-making, ensuring the privacy and security of shared data is paramount across various domains. Applying existing deep neural networks (DNNs) to encrypted data is challenging and often compromises performance and security while adding computational overhead. To address these limitations, this research introduces a secure framework consisting of a learnable encryption method based on a block-pixel operation to encrypt the data, which is then integrated with a Vision Transformer (ViT). The proposed framework ensures data privacy and security by creating a unique scrambling pattern per key, providing robust performance against adversarial attacks without compromising computational efficiency or data integrity. The framework was tested on sensitive medical datasets to validate its efficacy, proving its ability to handle highly confidential information securely, and was validated with a 94% success rate after extensive testing on real-world datasets such as MRI brain tumors and histological scans of lung and colon cancers. Additionally, the framework was tested under diverse adversarial attempts against secure data sharing, maintaining optimum performance and demonstrating its effectiveness in various threat scenarios. These comprehensive analyses underscore its robustness, making it a trustworthy solution for secure data sharing in critical applications.
   Submitted 25 January 2025; originally announced January 2025.
   Comments: 6 pages, 4 figures.
7. arXiv:2501.14999 [cs.CV] (https://arxiv.org/abs/2501.14999)
   Title: VideoPure: Diffusion-based Adversarial Purification for Video Recognition
   Authors: Kaixun Jiang, Zhaoyu Chen, Jiyuan Fu, Lingyi Hong, Jinglun Li, Wenqiang Zhang
   Abstract: Recent work indicates that video recognition models are vulnerable to adversarial examples, posing a serious security risk to downstream applications. However, current research has primarily focused on adversarial attacks, with limited work exploring defense mechanisms. Furthermore, due to the spatial-temporal complexity of videos, existing video defense methods face issues of high cost, overfitting, and limited defense performance. Recently, diffusion-based adversarial purification methods have achieved robust defense performance in the image domain, but because of the additional temporal dimension in videos, directly applying them to the video domain degrades both performance and efficiency. To achieve an efficient and effective video adversarial defense, we propose VideoPure, the first diffusion-based video purification framework for improving the adversarial robustness of video recognition models. Given an adversarial example, we first employ temporal DDIM inversion to transform the input distribution into a temporally consistent, trajectory-defined distribution that covers the adversarial noise while preserving more of the video structure. Then, during DDIM denoising, we leverage the intermediate results at each denoising step and conduct guided spatial-temporal optimization, removing adversarial noise while maintaining temporal consistency. Finally, we feed the list of optimized intermediate results into the video recognition model for multi-step voting to obtain the predicted class. We investigate the defense performance of our method against black-box, gray-box, and adaptive attacks on benchmark datasets and models. Compared with other adversarial purification methods, our method demonstrates better overall defense performance against different attacks. Our code is available at https://github.com/deep-kaixun/VideoPure.
   Submitted 24 January 2025; originally announced January 2025.
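The final multi-step voting stage described in the abstract reduces to a majority vote over classifications of the intermediate denoising results; a minimal sketch, with the DDIM inversion and guided denoising abstracted away as an input list:

    from collections import Counter

    def purify_and_vote(recognizer, denoised_intermediates):
        """Classify each optimized intermediate from the DDIM denoising
        trajectory and return the majority class. `denoised_intermediates`
        stands in for the list of purified video tensors the framework
        produces upstream."""
        votes = Counter(recognizer(video) for video in denoised_intermediates)
        return votes.most_common(1)[0][0]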
8. arXiv:2501.02467 [cs.CV] (https://arxiv.org/abs/2501.02467)
   Title: DeTrack: In-model Latent Denoising Learning for Visual Object Tracking
   Authors: Xinyu Zhou, Jinglun Li, Lingyi Hong, Kaixun Jiang, Pinxue Guo, Weifeng Ge, Wenqiang Zhang
   Abstract: Previous visual object tracking methods employ image-feature regression models or coordinate autoregression models for bounding-box prediction. Image-feature regression methods depend heavily on matching results and do not exploit positional priors, while autoregressive approaches can only be trained on the bounding boxes available in the training set, potentially yielding suboptimal performance on unseen data at test time. Inspired by diffusion models, denoising learning enhances a model's robustness to unseen data. We therefore introduce noise into bounding boxes, generating noisy boxes for training and improving robustness on test data, and we propose a new paradigm that formulates visual object tracking as a denoising learning process. However, tracking algorithms are usually required to run in real time, and directly applying a diffusion model to object tracking would severely impair tracking speed. We therefore decompose the denoising process into individual denoising blocks within a single model, rather than running the model multiple times, and summarize the proposed paradigm as an in-model latent denoising learning process. Specifically, we propose a denoising Vision Transformer (ViT) composed of multiple denoising blocks, into each of which template and search embeddings are projected as conditions. Each denoising block is responsible for removing noise from a predicted bounding box, and the stacked blocks cooperate to accomplish the whole denoising process. We then use image features and trajectory information to refine the denoised bounding box, and additionally employ trajectory memory and visual memory to improve tracking stability. Experimental results validate the effectiveness of our approach, achieving competitive performance on several challenging datasets.
   Submitted 5 January 2025; originally announced January 2025.
   Comments: Accepted by NeurIPS 2024.
9. arXiv:2412.19533 [cs.CV, cs.AI] (https://arxiv.org/abs/2412.19533)
   Title: P3S-Diffusion: A Selective Subject-Driven Generation Framework via Point Supervision
   Authors: Junjie Hu, Shuyong Gao, Lingyi Hong, Qishan Wang, Yuzhou Zhao, Yan Wang, Wenqiang Zhang
   Abstract: Recent research in subject-driven generation increasingly emphasizes the importance of selective subject features. Nevertheless, accurately selecting the content in a given reference image still poses challenges, especially when selecting among similar subjects in an image (e.g., two different dogs). Some methods attempt to use text prompts or pixel masks to isolate specific elements, but text prompts often fall short of precisely describing specific content, and pixel masks are often expensive to obtain. To address this, we introduce P3S-Diffusion, a novel architecture for context-selected subject-driven generation via point supervision. P3S-Diffusion leverages minimal-cost labels (e.g., points) to generate subject-driven images. During fine-tuning, it can generate an expanded base mask from these points, obviating the need for additional segmentation models. The mask is employed for inpainting and for aligning with the subject representation. P3S-Diffusion preserves fine subject features through multi-layer condition injection, enhanced by an attention consistency loss for improved training. Extensive experiments demonstrate its excellent feature preservation and image generation capabilities.
   Submitted 5 January 2025; v1 submitted 27 December 2024; originally announced December 2024.
10. arXiv:2412.02090 [stat.ML, cs.LG, physics.data-an] (https://arxiv.org/abs/2412.02090)
    Title: MEP-Net: Generating Solutions to Scientific Problems with Limited Knowledge by Maximum Entropy Principle
    Authors: Wuyue Yang, Liangrong Peng, Guojie Li, Liu Hong
    Abstract: The maximum entropy principle (MEP) offers an effective and unbiased approach to inferring unknown probability distributions from incomplete information, while neural networks provide the flexibility to learn complex distributions from data. This paper proposes a novel neural network architecture, the MEP-Net, which combines the MEP with neural networks to generate probability distributions from moment constraints. We also provide a comprehensive overview of the fundamentals of the maximum entropy principle, its mathematical formulations, and a rigorous justification of its applicability to non-equilibrium systems based on the large deviations principle. Through extensive numerical experiments, we demonstrate that the MEP-Net can be particularly useful for modeling the evolution of probability distributions in biochemical reaction networks and for generating complex distributions from data.
    Submitted 2 December 2024; originally announced December 2024.
    Comments: 35 pages, 6 figures, 2 tables.
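The classical computation that MEP-Net wraps in a neural network is the maximum-entropy fit p(x) ∝ exp(Σ λ_i f_i(x)) subject to moment constraints E[f_i] = m_i, solvable via its convex dual. A grid-based sketch of that underlying principle (MEP-Net itself replaces this fixed exponential-family form with a learned network):

    import numpy as np
    from scipy.optimize import minimize

    def maxent_density(xs, feature_fns, target_moments):
        """Fit p(x) proportional to exp(sum_i lam_i * f_i(x)) on a 1-D grid so
        that its moments match `target_moments`, by minimizing the convex dual
        logZ(lam) - lam . m."""
        feats = np.stack([f(xs) for f in feature_fns])  # (k, n) feature matrix
        dx = xs[1] - xs[0]

        def dual(lam):
            logits = lam @ feats
            log_z = np.log(np.exp(logits).sum() * dx)   # grid-based normalizer
            return log_z - lam @ np.asarray(target_moments)

        lam = minimize(dual, x0=np.zeros(len(feature_fns))).x
        p = np.exp(lam @ feats)
        return p / (p.sum() * dx)

    # Matching mean 0 and second moment 1 recovers a standard Gaussian:
    # xs = np.linspace(-6, 6, 601)
    # p = maxent_density(xs, [lambda x: x, lambda x: x ** 2], [0.0, 1.0])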
11. arXiv:2411.19757 [cs.LG, cs.CV] (https://arxiv.org/abs/2411.19757)
    Title: Dual Risk Minimization: Towards Next-Level Robustness in Fine-tuning Zero-Shot Models
    Authors: Kaican Li, Weiyan Xie, Yongxiang Huang, Didan Deng, Lanqing Hong, Zhenguo Li, Ricardo Silva, Nevin L. Zhang
    Abstract: Fine-tuning foundation models often compromises their robustness to distribution shifts. To remedy this, most robust fine-tuning methods aim to preserve the pre-trained features. However, not all pre-trained features are robust, and those methods are largely indifferent to which ones to preserve. We propose dual risk minimization (DRM), which combines empirical risk minimization with worst-case risk minimization, to better preserve the core features of downstream tasks. In particular, we utilize core-feature descriptions generated by LLMs to induce core-based zero-shot predictions, which then serve as proxies to estimate the worst-case risk. DRM balances two crucial aspects of model robustness, expected performance and worst-case performance, establishing a new state of the art on various real-world benchmarks. DRM significantly improves the out-of-distribution performance of CLIP ViT-L/14@336 on ImageNet (75.9 to 77.1), WILDS-iWildCam (47.1 to 51.8), and WILDS-FMoW (50.7 to 53.1), opening up new avenues for robust fine-tuning. Our code is available at https://github.com/vaynexie/DRM.
    Submitted 29 November 2024; originally announced November 2024.
    Comments: NeurIPS 2024.
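One plausible reading of the dual-risk objective, sketched below: an empirical-risk term on the labels plus a divergence from core-based zero-shot predictions standing in for the worst-case risk. The KL proxy and the `alpha` weighting are assumptions for illustration; the paper's exact estimator may differ:

    import torch
    import torch.nn.functional as F

    def dual_risk_loss(logits, labels, core_zero_shot_probs, alpha=0.5):
        """Empirical risk on the labels plus a divergence from core-based
        zero-shot predictions used as a worst-case-risk proxy."""
        erm = F.cross_entropy(logits, labels)  # expected performance
        proxy = F.kl_div(F.log_softmax(logits, dim=-1),
                         core_zero_shot_probs, reduction="batchmean")
        return (1 - alpha) * erm + alpha * proxy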
In this paper, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13807v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13807v1-abstract-full" style="display: none;"> The rapid advancement of diffusion models has greatly improved video synthesis, especially in controllable video generation, which is essential for applications like autonomous driving. However, existing methods are limited by scalability and how control conditions are integrated, failing to meet the need for high-resolution, long videos in autonomous driving applications. In this paper, we introduce MagicDriveDiT, a novel approach based on the DiT architecture that tackles these challenges. Our method enhances scalability through flow matching and employs a progressive training strategy to manage complex scenarios. By incorporating spatial-temporal conditional encoding, MagicDriveDiT achieves precise control over spatial-temporal latents. Comprehensive experiments show its superior performance in generating realistic street scene videos with higher resolution and more frames. MagicDriveDiT significantly improves video generation quality and spatial-temporal controls, expanding its potential applications across various tasks in autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13807v1-abstract-full').style.display = 'none'; document.getElementById('2411.13807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://flymin.github.io/magicdrivedit/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13425">arXiv:2411.13425</a> <span> [<a href="https://arxiv.org/pdf/2411.13425">pdf</a>, <a href="https://arxiv.org/format/2411.13425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> WaterPark: A Robustness Assessment of Language Model Watermarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiacheng Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zian Wang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lauren Hong</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shouling Ji</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13425v2-abstract-short" style="display: inline;"> Various watermarking methods (“watermarkers”) have been proposed to identify LLM-generated texts; yet, due to the lack of unified evaluation platforms, many critical questions remain under-explored: i) What are the strengths/limitations of various watermarkers, especially their attack robustness? ii) How do various design choices impact their robustness? iii) How to optimally operate watermarker… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13425v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13425v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13425v2-abstract-full" style="display: none;"> Various watermarking methods (“watermarkers”) have been proposed to identify LLM-generated texts; yet, due to the lack of unified evaluation platforms, many critical questions remain under-explored: i) What are the strengths/limitations of various watermarkers, especially their attack robustness? ii) How do various design choices impact their robustness? iii) How to optimally operate watermarkers in adversarial environments? To fill this gap, we systematize existing LLM watermarkers and watermark removal attacks, mapping out their design spaces. We then develop WaterPark, a unified platform that integrates 10 state-of-the-art watermarkers and 12 representative attacks. More importantly, by leveraging WaterPark, we conduct a comprehensive assessment of existing watermarkers, unveiling the impact of various design choices on their attack robustness. We further explore the best practices to operate watermarkers in adversarial environments. We believe our study sheds light on current LLM watermarking techniques while WaterPark serves as a valuable testbed to facilitate future research.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13425v2-abstract-full').style.display = 'none'; document.getElementById('2411.13425v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11930">arXiv:2411.11930</a> <span> [<a href="https://arxiv.org/pdf/2411.11930">pdf</a>, <a href="https://arxiv.org/format/2411.11930">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AtomThink: A Slow Thinking Framework for Multimodal Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+K">Kun Xiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhili Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zihao Jiang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yunshuang Nie</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Runhui Huang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Haoxiang Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanhui Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weiran Huang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yihan Zeng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11930v3-abstract-short" style="display: inline;"> In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of “slow thinking” into multimodal large language models (MLLMs).
Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11930v3-abstract-full').style.display = 'inline'; document.getElementById('2411.11930v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11930v3-abstract-full" style="display: none;"> In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of “slow thinking” into multimodal large language models (MLLMs). Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoning. To this end, we design a novel AtomThink framework composed of three key modules: (i) a CoT annotation engine that automatically generates high-quality CoT annotations to address the lack of high-quality visual mathematical data; (ii) an atomic step fine-tuning strategy that jointly optimizes an MLLM and a policy reward model (PRM) for step-wise reasoning; and (iii) four different search strategies that can be applied with the PRM to complete reasoning. Additionally, we propose AtomMATH, a large-scale multimodal dataset of long CoTs, and an atomic capability evaluation metric for mathematical tasks. Extensive experimental results show that the proposed AtomThink significantly improves the performance of baseline MLLMs, achieving approximately 50% relative accuracy gains on MathVista and 120% on MathVerse. To support the advancement of multimodal slow-thinking models, we will make our code and dataset publicly available on https://github.com/Quinn777/AtomThink. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11930v3-abstract-full').style.display = 'none'; document.getElementById('2411.11930v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21127">arXiv:2410.21127</a> <span> [<a href="https://arxiv.org/pdf/2410.21127">pdf</a>, <a href="https://arxiv.org/format/2410.21127">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Enhanced Mutation Mastery: Augmenting Zero-Shot Prediction of Protein Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Banghao Wu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingxin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21127v1-abstract-short" style="display: inline;"> Enzyme engineering enables the modification of wild-type proteins to meet industrial and research demands by enhancing catalytic activity, stability, binding affinities, and other properties. The emergence of deep learning methods for protein modeling has demonstrated superior results at lower costs compared to traditional approaches such as directed evolution and rational design. In mutation effe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21127v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21127v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21127v1-abstract-full" style="display: none;"> Enzyme engineering enables the modification of wild-type proteins to meet industrial and research demands by enhancing catalytic activity, stability, binding affinities, and other properties. The emergence of deep learning methods for protein modeling has demonstrated superior results at lower costs compared to traditional approaches such as directed evolution and rational design. In mutation effect prediction, the key to pre-training deep learning models lies in accurately interpreting the complex relationships among protein sequence, structure, and function. This study introduces a retrieval-enhanced protein language model for comprehensive analysis of native properties from sequence and local structural interactions, as well as evolutionary properties from retrieved homologous sequences. The state-of-the-art performance of the proposed ProtREM is validated on over 2 million mutants across 217 assays from an open benchmark (ProteinGym). We also conducted post-hoc analyses of the model's ability to improve the stability and binding affinity of a VHH antibody. Additionally, we designed 10 new mutants on a DNA polymerase and conducted wet-lab experiments to evaluate their enhanced activity at higher temperatures. 
Both in silico and experimental evaluations confirmed that our method provides reliable predictions of mutation effects, offering an auxiliary tool for biologists aiming to evolve existing enzymes. The implementation is publicly available at https://github.com/tyang816/ProtREM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21127v1-abstract-full').style.display = 'none'; document.getElementById('2410.21127v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 10 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20178">arXiv:2410.20178</a> <span> [<a href="https://arxiv.org/pdf/2410.20178">pdf</a>, <a href="https://arxiv.org/format/2410.20178">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LLMs Can Evolve Continually on Modality for X-Modal Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiazuo Yu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Haomiao Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lu Zhang</a>, <a href="/search/cs?searchtype=author&query=Diao%2C+H">Haiwen Diao</a>, <a href="/search/cs?searchtype=author&query=Zhuge%2C+Y">Yunzhi Zhuge</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dong Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">You He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20178v2-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have gained significant attention due to their impressive capabilities in multimodal understanding. However, existing methods rely heavily on extensive modal-specific pretraining and joint-modal tuning, leading to significant computational burdens when expanding to new modalities. 
In this paper, we propose PathWeave, a flexible and scalable framework with m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20178v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20178v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20178v2-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have gained significant attention due to their impressive capabilities in multimodal understanding. However, existing methods rely heavily on extensive modal-specific pretraining and joint-modal tuning, leading to significant computational burdens when expanding to new modalities. In this paper, we propose PathWeave, a flexible and scalable framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We leverage the concept of Continual Learning and develop an incremental training strategy atop pre-trained MLLMs, enabling their expansion to new modalities using uni-modal data, without executing joint-modal pretraining. In detail, a novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and cross-modal adapters are seamlessly integrated to facilitate efficient modality alignment and collaboration. Additionally, an MoE-based gating module is applied between the two types of adapters to further enhance the multimodal interaction. To investigate the proposed method, we establish a challenging benchmark called Continual Learning of Modality (MCL), which consists of high-quality QA data from five distinct modalities: image, video, audio, depth and point cloud. Extensive experiments demonstrate the effectiveness of the proposed AnA framework on learning plasticity and memory stability during continual learning. Furthermore, PathWeave performs comparably to state-of-the-art MLLMs while concurrently reducing parameter training burdens by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20178v2-abstract-full').style.display = 'none'; document.getElementById('2410.20178v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16458">arXiv:2410.16458</a> <span> [<a href="https://arxiv.org/pdf/2410.16458">pdf</a>, <a href="https://arxiv.org/format/2410.16458">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> STAR: A Simple Training-free Approach for Recommendations using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dong-Ho Lee</a>, <a href="/search/cs?searchtype=author&query=Kraft%2C+A">Adam Kraft</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Long Jin</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+N">Nikhil Mehta</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Taibai Xu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lichan Hong</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+E+H">Ed H. Chi</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xinyang Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16458v1-abstract-short" style="display: inline;"> Recent progress in large language models (LLMs) offers promising new approaches for recommendation system (RecSys) tasks. While the current state-of-the-art methods rely on fine-tuning LLMs to achieve optimal results, this process is costly and introduces significant engineering complexities. Conversely, methods that bypass fine-tuning and use LLMs directly are less resource-intensive but often fa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16458v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16458v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16458v1-abstract-full" style="display: none;"> Recent progress in large language models (LLMs) offers promising new approaches for recommendation system (RecSys) tasks. While the current state-of-the-art methods rely on fine-tuning LLMs to achieve optimal results, this process is costly and introduces significant engineering complexities. Conversely, methods that bypass fine-tuning and use LLMs directly are less resource-intensive but often fail to fully capture both semantic and collaborative information, resulting in sub-optimal performance compared to their fine-tuned counterparts. In this paper, we propose a Simple Training-free Approach for Recommendation (STAR), a framework that utilizes LLMs and can be applied to various recommendation tasks without the need for fine-tuning. Our approach involves a retrieval stage that uses semantic embeddings from LLMs combined with collaborative user information to retrieve candidate items. We then apply an LLM for pairwise ranking to enhance next-item prediction. Experimental results on the Amazon Review dataset show competitive performance for next item prediction, even with our retrieval stage alone. 
Our full method achieves Hits@10 performance of +23.8% on Beauty, +37.5% on Toys and Games, and -1.8% on Sports and Outdoors relative to the best supervised models. This framework offers an effective alternative to traditional supervised models, highlighting the potential of LLMs in recommendation systems without extensive training or custom architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16458v1-abstract-full').style.display = 'none'; document.getElementById('2410.16458v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12245">arXiv:2410.12245</a> <span> [<a href="https://arxiv.org/pdf/2410.12245">pdf</a>, <a href="https://arxiv.org/format/2410.12245">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing Healthcare: Innovative ML Approaches for Improved Medical Imaging in Data-Constrained Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Amin%2C+A">Al Amin</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+K">Kamrul Hasan</a>, <a href="/search/cs?searchtype=author&query=Zein-Sabatto%2C+S">Saleh Zein-Sabatto</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Shetty%2C+S">Sachin Shetty</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+I">Imtiaz Ahmed</a>, <a href="/search/cs?searchtype=author&query=Islam%2C+T">Tariqul Islam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12245v1-abstract-short" style="display: inline;"> Healthcare industries face challenges with rare diseases due to limited samples. Artificial Intelligence (AI) communities have addressed this situation by creating synthetic data, which raises ethical and privacy issues in the medical domain. This research introduces the CAT-U-Net framework as a new approach to overcome these limitations, which enhances feature extraction from medical images wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12245v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12245v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12245v1-abstract-full" style="display: none;"> Healthcare industries face challenges with rare diseases due to limited samples. Artificial Intelligence (AI) communities have addressed this situation by creating synthetic data, which raises ethical and privacy issues in the medical domain. This research introduces the CAT-U-Net framework as a new approach to overcome these limitations, which enhances feature extraction from medical images without the need for large datasets.
The proposed framework adds an extra concatenation layer with downsampling parts, thereby improving its ability to learn from limited data while maintaining patient privacy. To validate the proposed framework's robustness, datasets covering different medical conditions were utilized, including COVID-19, brain tumors, and wrist fractures. The framework achieved nearly 98% reconstruction accuracy, with a Dice coefficient close to 0.946. The proposed CAT-U-Net has the potential to make a substantial difference in medical image diagnostics in settings with limited data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12245v1-abstract-full').style.display = 'none'; document.getElementById('2410.12245v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11007">arXiv:2410.11007</a> <span> [<a href="https://arxiv.org/pdf/2410.11007">pdf</a>, <a href="https://arxiv.org/format/2410.11007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Assessing the Human Likeness of AI-Generated Counterspeech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoying Song</a>, <a href="/search/cs?searchtype=author&query=Mamidisetty%2C+S">Sujana Mamidisetty</a>, <a href="/search/cs?searchtype=author&query=Blanco%2C+E">Eduardo Blanco</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingzi Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11007v2-abstract-short" style="display: inline;"> Counterspeech is a targeted response to counteract and challenge abusive or hateful content. It effectively curbs the spread of hatred and fosters constructive online communication. Previous studies have proposed different strategies for automatically generating counterspeech. Evaluations, however, focus on relevance, surface form, and other shallow linguistic characteristics. This paper investigat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11007v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11007v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11007v2-abstract-full" style="display: none;"> Counterspeech is a targeted response to counteract and challenge abusive or hateful content. It effectively curbs the spread of hatred and fosters constructive online communication. Previous studies have proposed different strategies for automatically generating counterspeech. Evaluations, however, focus on relevance, surface form, and other shallow linguistic characteristics.
This paper investigates the human likeness of AI-generated counterspeech, a critical factor influencing effectiveness. We implement and evaluate several LLM-based generation strategies, and discover that AI-generated and human-written counterspeech can be easily distinguished by both simple classifiers and humans. Further, we reveal differences in linguistic characteristics, politeness, and specificity. The dataset used in this study is publicly available for further research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11007v2-abstract-full').style.display = 'none'; document.getElementById('2410.11007v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at the COLING 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> COLING 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03403">arXiv:2410.03403</a> <span> [<a href="https://arxiv.org/pdf/2410.03403">pdf</a>, <a href="https://arxiv.org/format/2410.03403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Distributed Networked Multi-task Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingzhou Hong</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+A">Alfredo Garcia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03403v1-abstract-short" style="display: inline;"> We consider a distributed multi-task learning scheme that accounts for multiple linear model estimation tasks with heterogeneous and/or correlated data streams. We assume that nodes can be partitioned into groups corresponding to different learning tasks and communicate according to a directed network topology. Each node estimates a linear model asynchronously and is subject to local (within-group… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03403v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03403v1-abstract-full" style="display: none;"> We consider a distributed multi-task learning scheme that accounts for multiple linear model estimation tasks with heterogeneous and/or correlated data streams. We assume that nodes can be partitioned into groups corresponding to different learning tasks and communicate according to a directed network topology. 
Each node estimates a linear model asynchronously and is subject to local (within-group) regularization and global (across groups) regularization terms targeting noise reduction and generalization performance improvement respectively. We provide a finite-time characterization of convergence of the estimators and task relation and illustrate the scheme's general applicability in two examples: random field temperature estimation and modeling student performance from different academic districts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03403v1-abstract-full').style.display = 'none'; document.getElementById('2410.03403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02647">arXiv:2410.02647</a> <span> [<a href="https://arxiv.org/pdf/2410.02647">pdf</a>, <a href="https://arxiv.org/format/2410.02647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Immunogenicity Prediction with Dual Attention Enables Vaccine Target Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Song Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+S">Song Ke</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingxin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02647v1-abstract-short" style="display: inline;"> Immunogenicity prediction is a central topic in reverse vaccinology for finding candidate vaccines that can trigger protective immune responses. Existing approaches typically rely on highly compressed features and simple model architectures, leading to limited prediction accuracy and poor generalizability. To address these challenges, we introduce ProVaccine, a novel deep learning solution with a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02647v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02647v1-abstract-full" style="display: none;"> Immunogenicity prediction is a central topic in reverse vaccinology for finding candidate vaccines that can trigger protective immune responses. Existing approaches typically rely on highly compressed features and simple model architectures, leading to limited prediction accuracy and poor generalizability. 
To address these challenges, we introduce ProVaccine, a novel deep learning solution with a dual attention mechanism that integrates pre-trained latent vector representations of protein sequences and structures. We also compile the most comprehensive immunogenicity dataset to date, encompassing over 9,500 antigen sequences, structures, and immunogenicity labels from bacteria, viruses, and tumors. Extensive experiments demonstrate that ProVaccine outperforms existing methods across a wide range of evaluation metrics. Furthermore, we establish a post-hoc validation protocol to assess the practical significance of deep learning models in tackling vaccine design challenges. Our work provides an effective tool for vaccine design and sets valuable benchmarks for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02647v1-abstract-full').style.display = 'none'; document.getElementById('2410.02647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 11 tables, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19342">arXiv:2409.19342</a> <span> [<a href="https://arxiv.org/pdf/2409.19342">pdf</a>, <a href="https://arxiv.org/format/2409.19342">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-Prompt: Multi-modal Visual Prompt for Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wanyun Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hao Huang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19342v1-abstract-short" style="display: inline;"> Multi-modal Video Object Segmentation (VOS), including RGB-Thermal, RGB-Depth, and RGB-Event, has garnered attention due to its capability to address challenging scenarios where traditional VOS methods struggle, such as extreme illumination, rapid motion, and background distraction. 
Existing approaches often involve designing specific additional branches and performing full-parameter fine-tuning f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19342v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19342v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19342v1-abstract-full" style="display: none;"> Multi-modal Video Object Segmentation (VOS), including RGB-Thermal, RGB-Depth, and RGB-Event, has garnered attention due to its capability to address challenging scenarios where traditional VOS methods struggle, such as extreme illumination, rapid motion, and background distraction. Existing approaches often involve designing specific additional branches and performing full-parameter fine-tuning for fusion in each task. However, this paradigm not only duplicates research efforts and hardware costs but also risks model collapse with the limited multi-modal annotated data. In this paper, we propose a universal framework named X-Prompt for all multi-modal video object segmentation tasks, designated as RGB+X. The X-Prompt framework first pre-trains a video object segmentation foundation model using RGB data, and then utilizes prompts from the additional modality to adapt it to downstream multi-modal tasks with limited data. Within the X-Prompt framework, we introduce the Multi-modal Visual Prompter (MVP), which allows prompting the foundation model with the various modalities to segment objects precisely. We further propose the Multi-modal Adaptation Experts (MAEs) to adapt the foundation model with pluggable modality-specific knowledge without compromising the generalization capacity. To evaluate the effectiveness of the X-Prompt framework, we conduct extensive experiments on 3 tasks across 4 benchmarks. The proposed universal X-Prompt framework consistently outperforms the full fine-tuning paradigm and achieves state-of-the-art performance. Code: https://github.com/PinxueGuo/X-Prompt.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19342v1-abstract-full').style.display = 'none'; document.getElementById('2409.19342v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACMMM'2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18042">arXiv:2409.18042</a> <span> [<a href="https://arxiv.org/pdf/2409.18042">pdf</a>, <a href="https://arxiv.org/format/2409.18042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Y">Yunhao Gou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Runhui Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhili Liu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+D">Daxin Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yihan Zeng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kuo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+K">Kun Xiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoyuan Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+H">Haoli Bai</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohui Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+W">Weike Jin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+N">Nian Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James T. Kwok</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hengshuang Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a> , et al. (6 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18042v2-abstract-short" style="display: inline;"> GPT-4o, an omni-modal model that enables vocal conversations with diverse emotions and tones, marks a milestone for omni-modal foundation models. However, empowering Large Language Models to perceive and generate images, texts, and speeches end-to-end with publicly available data remains challenging in the open-source community. 
Existing vision-language models rely on external tools for the speech… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18042v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18042v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18042v2-abstract-full" style="display: none;"> GPT-4o, an omni-modal model that enables vocal conversations with diverse emotions and tones, marks a milestone for omni-modal foundation models. However, empowering Large Language Models to perceive and generate images, texts, and speeches end-to-end with publicly available data remains challenging in the open-source community. Existing vision-language models rely on external tools for speech processing, while speech-language models still suffer from limited, or even absent, vision-understanding abilities. To address this gap, we propose EMOVA (EMotionally Omni-present Voice Assistant) to equip Large Language Models with end-to-end speech capabilities while maintaining the leading vision-language performance. With a semantic-acoustic disentangled speech tokenizer, we notice, surprisingly, that omni-modal alignment can further enhance vision-language and speech abilities compared with the corresponding bi-modal aligned counterparts. Moreover, a lightweight style module is proposed for flexible speech style controls (e.g., emotions and pitches). For the first time, EMOVA achieves state-of-the-art performance on both the vision-language and speech benchmarks, while also supporting omni-modal spoken dialogue with vivid emotions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18042v2-abstract-full').style.display = 'none'; document.getElementById('2409.18042v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://emova-ollm.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17564">arXiv:2409.17564</a> <span> [<a href="https://arxiv.org/pdf/2409.17564">pdf</a>, <a href="https://arxiv.org/format/2409.17564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> General Compression Framework for Efficient Transformer Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shilin Yan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shuyong Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hong Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17564v1-abstract-short" style="display: inline;"> Transformer-based trackers have established a dominant role in the field of visual object tracking. While these trackers exhibit promising performance, their deployment on resource-constrained devices remains challenging due to inefficiencies. To improve the inference efficiency and reduce the computation cost, prior approaches have aimed to either design lightweight trackers or distill knowledge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17564v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17564v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17564v1-abstract-full" style="display: none;"> Transformer-based trackers have established a dominant role in the field of visual object tracking. While these trackers exhibit promising performance, their deployment on resource-constrained devices remains challenging due to inefficiencies. To improve the inference efficiency and reduce the computation cost, prior approaches have aimed to either design lightweight trackers or distill knowledge from larger teacher models into more compact student trackers. However, these solutions often sacrifice accuracy for speed. Thus, we propose a general model compression framework for efficient transformer object tracking, named CompressTracker, to reduce the size of a pre-trained tracking model into a lightweight tracker with minimal performance degradation. 
Our approach features a novel stage division strategy that segments the transformer layers of the teacher model into distinct stages, enabling the student model to emulate each corresponding teacher stage more effectively. Additionally, we design a unique replacement training technique that involves randomly substituting specific stages in the student model with those from the teacher model, as opposed to training the student model in isolation. Replacement training enhances the student model's ability to replicate the teacher model's behavior. To further force the student model to emulate the teacher model, we incorporate prediction guidance and stage-wise feature mimicking to provide additional supervision during the compression process. Our framework CompressTracker is structurally agnostic, making it compatible with any transformer architecture. We conduct a series of experiments to verify the effectiveness and generalizability of CompressTracker. Our CompressTracker-4 with 4 transformer layers, which is compressed from OSTrack, retains about 96% performance on LaSOT (66.1% AUC) while achieving a 2.17x speed-up. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17564v1-abstract-full').style.display = 'none'; document.getElementById('2409.17564v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11365">arXiv:2409.11365</a> <span> [<a href="https://arxiv.org/pdf/2409.11365">pdf</a>, <a href="https://arxiv.org/format/2409.11365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CoCA: Regaining Safety-awareness of Multimodal Large Language Models with Constitutional Calibration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiahui Gao</a>, <a href="/search/cs?searchtype=author&query=Pi%2C+R">Renjie Pi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tianyang Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Han Wu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xin Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11365v2-abstract-short" style="display: inline;"> The deployment of multimodal large language models (MLLMs) has demonstrated remarkable success in engaging in conversations involving visual inputs, thanks to the superior power of large language models (LLMs). Those MLLMs are typically built based on the LLMs, with an image encoder to process images into the token embedding space of the LLMs.
However, the integration of the visual modality has introd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11365v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11365v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11365v2-abstract-full" style="display: none;"> The deployment of multimodal large language models (MLLMs) has demonstrated remarkable success in engaging in conversations involving visual inputs, thanks to the superior power of large language models (LLMs). Those MLLMs are typically built based on the LLMs, with an image encoder to process images into the token embedding space of the LLMs. However, the integration of the visual modality has introduced a unique vulnerability: the MLLM becomes susceptible to malicious visual inputs and prone to generating sensitive or harmful responses, even though the LLM has been trained on textual datasets to align with human values. In this paper, we first raise the question: “Do the MLLMs possess safety-awareness against malicious image inputs?”. We find that after adding a principle that specifies the safety requirement into the input of the MLLM, the model's safety awareness is boosted. This phenomenon verifies the existence of the MLLM's safety-awareness against image inputs; it is only weakened by the modality gap. We then introduce a simple yet effective technique termed CoCA, which amplifies the safety-awareness of the MLLM by calibrating its output distribution. Our proposed strategy helps the model reclaim its original safety awareness without losing its original capabilities. We verify the effectiveness of our approach on both multimodal safety and understanding benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11365v2-abstract-full').style.display = 'none'; document.getElementById('2409.11365v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, COLM-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05847">arXiv:2409.05847</a> <span> [<a href="https://arxiv.org/pdf/2409.05847">pdf</a>, <a href="https://arxiv.org/format/2409.05847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LSVOS Challenge Report: Large-scale Complex and Long Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+H">Henghui Ding</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Ning Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Linjie Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yuchen Fan</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+D">Deshui Miao</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yameng Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenyu He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+J">Jinming Chai</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qin Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junpei Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+L">Licheng Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">LingLing Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+F">Feiyu Pan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiankai Lu</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05847v1-abstract-short" style="display: inline;"> Despite the promising performance of current video segmentation models on existing benchmarks, these models still struggle with complex scenes. In this paper, we introduce the 6th Large-scale Video Object Segmentation (LSVOS) challenge in conjunction with ECCV 2024 workshop. This year's challenge includes two tasks: Video Object Segmentation (VOS) and Referring Video Object Segmentation (RVOS). 
Despite the promising performance of current video segmentation models on existing benchmarks, these models still struggle with complex scenes. In this paper, we introduce the 6th Large-scale Video Object Segmentation (LSVOS) challenge, held in conjunction with the ECCV 2024 workshop. This year's challenge includes two tasks: Video Object Segmentation (VOS) and Referring Video Object Segmentation (RVOS). This year, we replace the classic YouTube-VOS and YouTube-RVOS benchmarks with the latest datasets MOSE, LVOS, and MeViS to assess VOS under more challenging, complex environments. The challenge attracted 129 registered teams from more than 20 institutes across over 8 countries. This report includes the challenge and dataset introduction and the methods used by the top 7 teams in the two tracks. More details can be found on our homepage https://lsvos.github.io/. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024 LSVOS Challenge Report: https://lsvos.github.io/</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15566">arXiv:2408.15566</a> <span> [<a href="https://arxiv.org/pdf/2408.15566">pdf</a>, <a href="https://arxiv.org/format/2408.15566">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TagOOD: A Novel Approach to Out-of-Distribution Detection via Vision-Language Representations and Class Center Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+W">Weifeng Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Multimodal fusion, leveraging data like vision and language, is rapidly gaining traction. This enriched data representation improves performance across various tasks. Existing methods for out-of-distribution (OOD) detection, a critical area where AI models encounter unseen data in real-world scenarios, rely heavily on whole-image features. These image-level features can include irrelevant information that hinders the detection of OOD samples, ultimately limiting overall performance. In this paper, we propose TagOOD, a novel approach for OOD detection that leverages vision-language representations to achieve label-free object feature decoupling from whole images. This decomposition enables a more focused analysis of object semantics, enhancing OOD detection performance. Subsequently, TagOOD trains a lightweight network on the extracted object features to learn representative class centers. These centers capture the central tendencies of in-distribution (IND) object classes, minimizing the influence of irrelevant image features during OOD detection. Finally, our approach efficiently detects OOD samples by calculating distance-based metrics as OOD scores between the learned centers and test samples. We conduct extensive experiments to evaluate TagOOD on several benchmark datasets and demonstrate its superior performance compared to existing OOD detection methods. This work presents a novel perspective for further exploration of multimodal information utilization in OOD detection, with potential applications across various tasks. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACMMM2024</span> </p> </li>
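<p>A minimal sketch of the final scoring step described in the TagOOD abstract, assuming object features and class centers have already been learned; the cosine-distance metric is an assumption, since the abstract only says "distance-based metrics":</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def ood_score(feat, class_centers):
    """Score a test feature by its distance to the nearest learned
    in-distribution class center; a larger score means more likely OOD."""
    feat = F.normalize(feat, dim=-1)                # (d,)
    centers = F.normalize(class_centers, dim=-1)    # (num_classes, d)
    cos_sim = centers @ feat                        # similarity to each center
    return (1.0 - cos_sim.max()).item()             # distance to nearest center
</code></pre>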
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACMMM2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14678">arXiv:2408.14678</a> <span> [<a href="https://arxiv.org/pdf/2408.14678">pdf</a>, <a href="https://arxiv.org/format/2408.14678">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Gap: Unpacking the Hidden Challenges in Knowledge Distillation for Online Ranking Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khani%2C+N">Nikhil Khani</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+A">Aniruddh Nath</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Abbo%2C+P">Pendo Abbo</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+L">Li Wei</a>, <a href="/search/cs?searchtype=author&query=Andrews%2C+S">Shawn Andrews</a>, <a href="/search/cs?searchtype=author&query=Kula%2C+M">Maciej Kula</a>, <a href="/search/cs?searchtype=author&query=Kahn%2C+J">Jarrod Kahn</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lichan Hong</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+E">Ed Chi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14678v1-abstract-short" style="display: inline;"> Knowledge Distillation (KD) is a powerful approach for compressing a large model into a smaller, more efficient model, particularly beneficial for latency-sensitive applications like recommender systems. However, current KD research predominantly focuses on Computer Vision (CV) and NLP tasks, overlooking unique data characteristics and challenges inherent to recommender systems. This paper address… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14678v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14678v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14678v1-abstract-full" style="display: none;"> Knowledge Distillation (KD) is a powerful approach for compressing a large model into a smaller, more efficient model, particularly beneficial for latency-sensitive applications like recommender systems. However, current KD research predominantly focuses on Computer Vision (CV) and NLP tasks, overlooking unique data characteristics and challenges inherent to recommender systems. 
This paper addresses these overlooked challenges, specifically: (1) mitigating data distribution shifts between teacher and student models, (2) efficiently identifying optimal teacher configurations within time and budgetary constraints, and (3) enabling computationally efficient and rapid sharing of teacher labels to support multiple students. We present a robust KD system developed and rigorously evaluated on multiple large-scale personalized video recommendation systems within Google. Our live experiment results demonstrate significant improvements in student model performance while ensuring the consistent and reliable generation of high-quality teacher labels from a continuous data stream. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13659">arXiv:2408.13659</a> <span> [<a href="https://arxiv.org/pdf/2408.13659">pdf</a>, <a href="https://arxiv.org/format/2408.13659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> ReactZyme: A Benchmark for Enzyme-Reaction Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+C">Chenqing Hua</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+B">Bozitao Zhong</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shuangjia Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Enzymes, with their specific catalyzed reactions, are necessary for all aspects of life, enabling diverse biological processes and adaptations. Predicting enzyme functions is essential for understanding biological pathways, guiding drug development, enhancing bioproduct yields, and facilitating evolutionary studies. Addressing the inherent complexities, we introduce a new approach to annotating enzymes based on their catalyzed reactions. This method provides detailed insights into specific reactions and is adaptable to newly discovered reactions, diverging from traditional classifications by protein family or expert-derived reaction classes. We employ machine learning algorithms to analyze enzyme reaction datasets, delivering a much more refined view of the functionality of enzymes. Our evaluation leverages the largest enzyme-reaction dataset to date, derived from the SwissProt and Rhea databases with entries up to January 8, 2024. We frame enzyme-reaction prediction as a retrieval problem, aiming to rank enzymes by their catalytic ability for specific reactions. With our model, we can recruit proteins for novel reactions and predict reactions in novel proteins, facilitating enzyme discovery and function annotation (https://github.com/WillHua127/ReactZyme). </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 38th Conference on Neural Information Processing Systems (NeurIPS 2024) Track on Datasets and Benchmarks </p> </li>
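<p>The retrieval framing in the ReactZyme abstract reduces, at inference time, to ranking candidate enzymes against a reaction query. A hedged sketch under the assumption that a reaction encoder and a protein encoder have already produced embeddings; the similarity choice is illustrative, not the benchmark's prescribed scorer:</p> <pre><code class="language-python">
import torch.nn.functional as F

def rank_enzymes_for_reaction(reaction_emb, enzyme_embs, top_k=10):
    """Rank enzymes by embedding similarity to a reaction, as a proxy
    for their catalytic ability for that reaction."""
    sims = F.normalize(enzyme_embs, dim=-1) @ F.normalize(reaction_emb, dim=-1)
    scores, idx = sims.topk(top_k)
    return idx, scores  # indices and scores of the top-k candidate enzymes
</code></pre>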
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09537">arXiv:2408.09537</a> <span> [<a href="https://arxiv.org/pdf/2408.09537">pdf</a>, <a href="https://arxiv.org/format/2408.09537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Sample-Optimal Large-Scale Optimal Subset Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zaile Li</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Weiwei Fan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L+J">L. Jeff Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Ranking and selection (R&S) conventionally aims to select the unique best alternative with the largest mean performance from a finite set of alternatives. However, for better supporting decision making, it may be more informative to deliver a small menu of alternatives whose mean performances are among the top $m$. Such a problem, called optimal subset selection (OSS), is generally more challenging to address than conventional R&S. This challenge becomes even more significant when the number of alternatives is considerably large. Thus, the focus of this paper is on addressing the large-scale OSS problem. To achieve this goal, we design a top-$m$ greedy selection mechanism that keeps sampling the current top $m$ alternatives, i.e., those with the top $m$ running sample means, and propose the explore-first top-$m$ greedy (EFG-$m$) procedure. Through an extended boundary-crossing framework, we prove that the EFG-$m$ procedure is both sample optimal and consistent in terms of the probability of good selection, confirming its effectiveness in solving the large-scale OSS problem. Surprisingly, we also demonstrate that the EFG-$m$ procedure delivers an indifference-based ranking within the selected subset of alternatives at no extra cost. This is highly beneficial as it provides deeper insights to decision-makers, enabling more informed decision-making. Lastly, numerical experiments validate our results and demonstrate the efficiency of our procedures. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
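<p>The EFG-$m$ procedure described above has a simple skeleton: an explore-first phase that samples every alternative a fixed number of times, then a greedy phase that keeps sampling the current top-$m$ by running sample means. A hedged sketch with a fixed total budget as the stopping rule; the paper's actual stopping and parameter choices are more refined:</p> <pre><code class="language-python">
def efg_m(sample, k, m, n0, total_budget):
    """sample(i) draws one noisy observation of alternative i (0-indexed)."""
    n = [0] * k      # per-alternative sample counts
    s = [0.0] * k    # per-alternative running sums
    def draw(i):
        n[i] += 1
        s[i] += sample(i)
    for i in range(k):           # explore-first phase: n0 samples each
        for _ in range(n0):
            draw(i)
    used = k * n0
    while used + m <= total_budget:   # greedy phase
        means = [s[i] / n[i] for i in range(k)]
        for i in sorted(range(k), key=lambda j: means[j], reverse=True)[:m]:
            draw(i)  # one more sample for each current top-m alternative
        used += m
    means = [s[i] / n[i] for i in range(k)]
    return sorted(range(k), key=lambda j: means[j], reverse=True)[:m]
</code></pre>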
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06391">arXiv:2408.06391</a> <span> [<a href="https://arxiv.org/pdf/2408.06391">pdf</a>, <a href="https://arxiv.org/format/2408.06391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Enzyme Function Prediction with Multi-scale Multi-modality Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rong%2C+D">Dingyi Rong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhuo Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+B">Bozitao Zhong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhouhan Lin</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ning Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Accurate prediction of enzyme function is crucial for elucidating biological mechanisms and driving innovation across various sectors. Existing deep learning methods tend to rely solely on either sequence data or structural data and predict the EC number as a whole, neglecting the intrinsic hierarchical structure of EC numbers. To address these limitations, we introduce MAPred, a novel multi-modality and multi-scale model designed to autoregressively predict the EC number of proteins. MAPred integrates both the primary amino acid sequence and the 3D tokens of proteins, employing a dual-pathway approach to capture comprehensive protein characteristics and essential local functional sites. Additionally, MAPred utilizes an autoregressive prediction network to sequentially predict the digits of the EC number, leveraging the hierarchical organization of EC classifications. Evaluations on benchmark datasets, including New-392, Price, and New-815, demonstrate that our method outperforms existing models, marking a significant advance in the reliability and granularity of protein function prediction within bioinformatics. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
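<p>The autoregressive decoding over EC digits described in the MAPred abstract can be pictured as a four-step loop, each step conditioned on the digits decoded so far. A hedged sketch; <code>decoder</code> is a hypothetical module mapping (protein features, digit prefix) to logits over the next level's classes, not MAPred's actual interface:</p> <pre><code class="language-python">
import torch

@torch.no_grad()
def predict_ec_number(decoder, protein_feats, num_levels=4):
    """Predict the four EC levels one at a time, left to right."""
    prefix = []
    for _ in range(num_levels):
        logits = decoder(protein_feats, prefix)   # logits for the next EC level
        prefix.append(int(logits.argmax(dim=-1)))
    return ".".join(str(d) for d in prefix)       # e.g. "1.1.1.1"
</code></pre>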
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06391v1-abstract-full').style.display = 'none'; document.getElementById('2408.06391v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05745">arXiv:2408.05745</a> <span> [<a href="https://arxiv.org/pdf/2408.05745">pdf</a>, <a href="https://arxiv.org/format/2408.05745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Improving Adversarial Transferability with Neighbourhood Gradient Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haijing Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiafeng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05745v1-abstract-short" style="display: inline;"> Deep neural networks (DNNs) are known to be susceptible to adversarial examples, leading to significant performance degradation. In black-box attack scenarios, a considerable attack performance gap between the surrogate model and the target model persists. This work focuses on enhancing the transferability of adversarial examples to narrow this performance gap. We observe that the gradient informa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05745v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05745v1-abstract-full" style="display: none;"> Deep neural networks (DNNs) are known to be susceptible to adversarial examples, leading to significant performance degradation. In black-box attack scenarios, a considerable attack performance gap between the surrogate model and the target model persists. This work focuses on enhancing the transferability of adversarial examples to narrow this performance gap. We observe that the gradient information around the clean image, i.e. Neighbourhood Gradient Information, can offer high transferability. Leveraging this, we propose the NGI-Attack, which incorporates Example Backtracking and Multiplex Mask strategies, to use this gradient information and enhance transferability fully. 
Specifically, we first adopt Example Backtracking to accumulate Neighbourhood Gradient Information as the initial momentum term. Multiplex Mask, which forms a multi-way attack strategy, forces the network to focus on non-discriminative regions, yielding richer gradient information within only a few iterations. Extensive experiments demonstrate that our approach significantly enhances adversarial transferability. In particular, when attacking numerous defense models, we achieve an average attack success rate of 95.8%. Notably, our method can be plugged into any off-the-shelf algorithm to improve its attack performance without additional time cost. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
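<p>A hedged sketch of the Example Backtracking step described above: accumulate gradients from a short walk around the clean image and use the result as the initial momentum of a momentum-based transfer attack. Step count, step size, and the L1-style normalization are illustrative assumptions, not the paper's exact settings:</p> <pre><code class="language-python">
import torch

def example_backtracking_momentum(model, loss_fn, x, y, steps=5, step_size=1/255):
    """Accumulate neighbourhood gradient information around the clean
    image x; returns an initial momentum term for the main attack loop."""
    momentum = torch.zeros_like(x)
    x_cur = x.clone()
    for _ in range(steps):
        x_cur.requires_grad_(True)
        grad = torch.autograd.grad(loss_fn(model(x_cur), y), x_cur)[0]
        momentum = momentum + grad / grad.abs().mean()  # MI-FGSM-style accumulation
        x_cur = (x_cur + step_size * grad.sign()).detach()
    return momentum
</code></pre>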
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04381">arXiv:2408.04381</a> <span> [<a href="https://arxiv.org/pdf/2408.04381">pdf</a>, <a href="https://arxiv.org/format/2408.04381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Understanding and Modeling Job Marketplace with Pretrained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yaochen Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Liang Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Binchi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liangjie Hong</a>, <a href="/search/cs?searchtype=author&query=Simon%2C+L">Luke Simon</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: The job marketplace is a heterogeneous graph composed of interactions among members (job-seekers), companies, and jobs. Understanding and modeling the job marketplace can benefit both job seekers and employers, ultimately contributing to the greater good of society. However, existing graph neural network (GNN)-based methods have a shallow understanding of the associated textual features and heterogeneous relations. To address the above challenges, we propose PLM4Job, a job marketplace foundation model that tightly couples pretrained language models (PLMs) with the job market graph, aiming to fully utilize the pretrained knowledge and reasoning ability to model member/job textual features as well as various member-job relations simultaneously. In the pretraining phase, we propose a heterogeneous ego-graph-based prompting strategy to model and aggregate member/job textual features based on the topological structure around the target member/job node, where entity type embeddings and graph positional embeddings are introduced accordingly to model different entities and their heterogeneous relations. Meanwhile, a proximity-aware attention alignment strategy is designed to dynamically adjust the attention of the PLM on ego-graph node tokens in the prompt, such that the attention can be better aligned with job marketplace semantics. Extensive experiments at LinkedIn demonstrate the effectiveness of PLM4Job. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by CIKM'24 applied research track</span> </p> </li>
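<p>The ego-graph prompting idea above linearizes a target node and its typed neighbors into PLM input. A hedged, text-only sketch; the template, field names, and example records are hypothetical, and PLM4Job additionally injects entity-type and graph positional embeddings rather than relying on plain text:</p> <pre><code class="language-python">
def ego_graph_prompt(node, neighbors):
    """Flatten a member/job ego-graph into a textual prompt."""
    lines = [f"Target {node['type']}: {node['text']}"]
    for nb in neighbors:
        lines.append(f"- related {nb['type']} ({nb['relation']}): {nb['text']}")
    return "\n".join(lines)

# Hypothetical usage:
prompt = ego_graph_prompt(
    {"type": "member", "text": "ML engineer, 5 yrs, Python/Spark"},
    [{"type": "job", "relation": "applied_to", "text": "Senior data engineer"},
     {"type": "company", "relation": "follows", "text": "Acme Analytics"}],
)
</code></pre>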
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00802">arXiv:2408.00802</a> <span> [<a href="https://arxiv.org/pdf/2408.00802">pdf</a>, <a href="https://arxiv.org/format/2408.00802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Leveraging LLM Reasoning Enhances Personalized Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tsai%2C+A+Y">Alicia Y. Tsai</a>, <a href="/search/cs?searchtype=author&query=Kraft%2C+A">Adam Kraft</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Long Jin</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chenwei Cai</a>, <a href="/search/cs?searchtype=author&query=Hosseini%2C+A">Anahita Hosseini</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Taibai Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zemin Zhang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lichan Hong</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+E+H">Ed H. Chi</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xinyang Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Recent advancements have showcased the potential of Large Language Models (LLMs) in executing reasoning tasks, particularly facilitated by Chain-of-Thought (CoT) prompting. While tasks like arithmetic reasoning involve clear, definitive answers and logical chains of thought, the application of LLM reasoning in recommendation systems (RecSys) presents a distinct challenge. RecSys tasks revolve around subjectivity and personalized preferences, an under-explored domain in utilizing LLMs' reasoning capabilities. Our study explores several aspects to better understand reasoning for RecSys and demonstrates how task quality improves by utilizing LLM reasoning in both zero-shot and finetuning settings. Additionally, we propose RecSAVER (Recommender Systems Automatic Verification and Evaluation of Reasoning) to automatically assess the quality of LLM reasoning responses without the requirement of curated gold references or human raters. We show that our framework aligns with real human judgment on the coherence and faithfulness of reasoning responses. Overall, our work shows that incorporating reasoning into RecSys can improve personalized tasks, paving the way for further advancements in recommender system methodologies. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published at ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07443">arXiv:2407.07443</a> <span> [<a href="https://arxiv.org/pdf/2407.07443">pdf</a>, <a href="https://arxiv.org/format/2407.07443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Secondary Structure-Guided Novel Protein Sequence Generation with Latent Graph Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yutong Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Andi Han</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Lirong Zheng</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingxin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07443v1-abstract-short" style="display: inline;"> The advent of deep learning has introduced efficient approaches for de novo protein sequence design, significantly improving success rates and reducing development costs compared to computational or experimental methods. However, existing methods face challenges in generating proteins with diverse lengths and shapes while maintaining key structural features. To address these challenges, we introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07443v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07443v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07443v1-abstract-full" style="display: none;"> The advent of deep learning has introduced efficient approaches for de novo protein sequence design, significantly improving success rates and reducing development costs compared to computational or experimental methods. However, existing methods face challenges in generating proteins with diverse lengths and shapes while maintaining key structural features. To address these challenges, we introduce CPDiffusion-SS, a latent graph diffusion model that generates protein sequences based on coarse-grained secondary structural information. CPDiffusion-SS offers greater flexibility in producing a variety of novel amino acid sequences while preserving overall structural constraints, thus enhancing the reliability and diversity of generated proteins. Experimental analyses demonstrate the significant superiority of the proposed method in producing diverse and novel sequences, with CPDiffusion-SS surpassing popular baseline methods on open benchmarks across various quantitative measurements. Furthermore, we provide a series of case studies to highlight the biological significance of the generation performance by the proposed method. 
The source code is publicly available at https://github.com/riacd/CPDiffusion-SS. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19755">arXiv:2406.19755</a> <span> [<a href="https://arxiv.org/pdf/2406.19755">pdf</a>, <a href="https://arxiv.org/format/2406.19755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Protein Representation Learning with Sequence Information Embedding: Does it Always Lead to a Better Performance? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Lirong Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+B">Bozitao Zhong</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingxin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Deep learning has become a crucial tool in studying proteins. While the significance of modeling protein structure has been discussed extensively in the literature, amino acid types are typically included in the input as a default operation for many inference tasks. This study demonstrates, with a structure alignment task, that embedding amino acid types may in some cases not help a deep learning model learn better representations. To this end, we propose ProtLOCA, a local geometry alignment method based solely on amino acid structure representation. The effectiveness of ProtLOCA is examined by a global structure-matching task on protein pairs with an independent test dataset based on CATH labels.
Our method outperforms existing sequence- and structure-based representation learning methods by more quickly and accurately matching structurally consistent protein domains. Furthermore, in local structure pairing tasks, ProtLOCA for the first time provides a valid solution to highlight common local structures among proteins with different overall structures but the same function. This suggests a new possibility for using deep learning methods to analyze protein structure to infer function. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17626">arXiv:2406.17626</a> <span> [<a href="https://arxiv.org/pdf/2406.17626">pdf</a>, <a href="https://arxiv.org/format/2406.17626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoSafe: Evaluating Large Language Model Safety in Multi-Turn Dialogue Coreference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+E">Erxin Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+M">Ming Liao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siqi Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zuchen Gao</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+F">Fei Mi</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: As large language models (LLMs) constantly evolve, ensuring their safety remains a critical research problem. Previous red-teaming approaches for LLM safety have primarily focused on single prompt attacks or goal hijacking.
To the best of our knowledge, we are the first to study LLM safety in multi-turn dialogue coreference. We created a dataset of 1,400 questions across 14 categories, each featuring multi-turn coreference safety attacks. We then conducted detailed evaluations on five widely used open-source LLMs. The results indicated that under multi-turn coreference safety attacks, the highest attack success rate was 56% with the LLaMA2-Chat-7b model, while the lowest was 13.9% with the Mistral-7B-Instruct model. These findings highlight the safety vulnerabilities of LLMs during dialogue coreference interactions. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14475">arXiv:2405.14475</a> <span> [<a href="https://arxiv.org/pdf/2405.14475">pdf</a>, <a href="https://arxiv.org/format/2405.14475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MagicDrive3D: Controllable 3D Generation for Any-View Rendering in Street Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+R">Ruiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhihao Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: While controllable generative models for images and videos have achieved remarkable success, high-quality models for 3D scenes, particularly in unbounded scenarios like autonomous driving, remain underdeveloped due to high data acquisition costs.
In this paper, we introduce MagicDrive3D, a novel pipeline for controllable 3D street scene generation that supports multi-condition control, including BEV maps, 3D objects, and text descriptions. Unlike previous methods that reconstruct before training the generative models, MagicDrive3D first trains a video generation model and then reconstructs from the generated data. This innovative approach enables easily controllable generation and static scene acquisition, resulting in high-quality scene reconstruction. To address the minor errors in generated content, we propose deformable Gaussian splatting with monocular depth initialization and appearance modeling to manage exposure discrepancies across viewpoints. Validated on the nuScenes dataset, MagicDrive3D generates diverse, high-quality 3D driving scenes that support any-view rendering and enhance downstream tasks like BEV segmentation. Our results demonstrate the framework's superior performance, showcasing its potential for autonomous driving simulation and beyond. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://flymin.github.io/magicdrive3d</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07893">arXiv:2405.07893</a> <span> [<a href="https://arxiv.org/pdf/2405.07893">pdf</a>, <a href="https://arxiv.org/format/2405.07893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Science based AI model certification for new operational environments with application in traffic state estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mupupuni%2C+D">Daryl Mupupuni</a>, <a href="/search/cs?searchtype=author&query=Guntu%2C+A">Anupama Guntu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+K">Kamrul Hasan</a>, <a href="/search/cs?searchtype=author&query=Keel%2C+L">Leehyun Keel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07893v1-abstract-short" style="display: inline;"> The expanding role of Artificial Intelligence (AI) in diverse engineering domains highlights the challenges associated with deploying AI models in new operational environments, involving substantial investments in data collection and model training. Rapid application of AI necessitates evaluating the feasibility of utilizing pre-trained models in unobserved operational settings with minimal or no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07893v1-abstract-full').style.display = 'inline'; document.getElementById('2405.07893v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07893v1-abstract-full" style="display: none;"> The expanding role of Artificial Intelligence (AI) in diverse engineering domains highlights the challenges associated with deploying AI models in new operational environments, involving substantial investments in data collection and model training. Rapid application of AI necessitates evaluating the feasibility of utilizing pre-trained models in unobserved operational settings with minimal or no additional data. However, interpreting the opaque nature of AI's black-box models remains a persistent challenge. Addressing this issue, this paper proposes a science-based certification methodology to assess the viability of employing pre-trained data-driven models in new operational environments. The methodology advocates a profound integration of domain knowledge, leveraging theoretical and analytical models from physics and related disciplines, with data-driven AI models. This novel approach introduces tools to facilitate the development of secure engineering systems, providing decision-makers with confidence in the trustworthiness and safety of AI-based models across diverse environments characterized by limited training data and dynamic, uncertain conditions. The paper demonstrates the efficacy of this methodology in real-world safety-critical scenarios, particularly in the context of traffic state estimation. 
Through simulation results, the study illustrates how the proposed methodology efficiently quantifies physical inconsistencies exhibited by pre-trained AI models. By utilizing analytical models, the methodology offers a means to gauge the applicability of pre-trained AI models in new operational environments. This research contributes to advancing the understanding and deployment of AI models, offering a robust certification framework that enhances confidence in their reliability and safety across a spectrum of operational conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07893v1-abstract-full').style.display = 'none'; document.getElementById('2405.07893v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures, ©2024 IEEE International Conference on Electro/Information Technology</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> EIT2024-082 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00557">arXiv:2405.00557</a> <span> [<a href="https://arxiv.org/pdf/2405.00557">pdf</a>, <a href="https://arxiv.org/format/2405.00557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mixture of insighTful Experts (MoTE): The Synergy of Thought Chains and Expert Mixtures in Self-Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhili Liu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Y">Yunhao Gou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiahui Gao</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+F">Fei Mi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xin Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qun Liu</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James T. Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00557v3-abstract-short" style="display: inline;"> As the capabilities of large language models (LLMs) have expanded dramatically, aligning these models with human values presents a significant challenge.
Traditional alignment strategies rely heavily on human intervention, such as Supervised Fine-Tuning (SFT) and Reinforcement Learning from Human Feedback (RLHF), or on the self-alignment capacities of LLMs, which usually require a strong LLM's eme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00557v3-abstract-full').style.display = 'inline'; document.getElementById('2405.00557v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00557v3-abstract-full" style="display: none;"> As the capabilities of large language models (LLMs) have expanded dramatically, aligning these models with human values presents a significant challenge. Traditional alignment strategies rely heavily on human intervention, such as Supervised Fine-Tuning (SFT) and Reinforcement Learning from Human Feedback (RLHF), or on the self-alignment capacities of LLMs, which usually require a strong LLM's emergent ability to improve its own initially flawed answers. To address these challenges, we propose a novel self-alignment method that utilizes a Chain of Thought (CoT) approach, termed AlignCoT. This method encompasses the stages of Question Analysis, Answer Guidance, and Safe Answer production. It is designed to enable LLMs to generate high-quality, safe responses throughout various stages of their development. Furthermore, we introduce the Mixture of insighTful Experts (MoTE) architecture, which applies a mixture of experts to enhance each component of the AlignCoT process, markedly increasing alignment efficiency. The MoTE approach not only outperforms existing methods in aligning LLMs with human values but also highlights the benefits of using self-generated data, yielding dual gains in alignment quality and training efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00557v3-abstract-full').style.display = 'none'; document.getElementById('2405.00557v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
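</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The three stages named in this abstract (Question Analysis, Answer Guidance, Safe Answer production) chain naturally as successive LLM calls. The Python below is a minimal sketch of such an AlignCoT-style pipeline, assuming a generic generate(prompt) text-in/text-out interface; the prompts and names are illustrative, not the authors' implementation, and the MoTE expert routing is omitted.</span> </p> <pre><code># Minimal AlignCoT-style sketch: one LLM call per stage.
# `generate` is any text-in/text-out LLM callable (an assumption here).

def align_cot(generate, question: str) -> str:
    # Stage 1: Question Analysis -- surface intent and potential safety risks.
    analysis = generate(
        "Analyze the intent and potential safety risks of this question:\n"
        + question
    )
    # Stage 2: Answer Guidance -- outline how a safe, helpful answer should look.
    guidance = generate(
        "Given this analysis:\n" + analysis
        + "\nOutline guidance for a helpful and safe answer."
    )
    # Stage 3: Safe Answer production -- produce the final aligned response.
    return generate(
        "Question: " + question
        + "\nGuidance: " + guidance
        + "\nWrite the final safe answer."
    )
</code></pre> <p class="is-size-7">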
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19326">arXiv:2404.19326</a> <span> [<a href="https://arxiv.org/pdf/2404.19326">pdf</a>, <a href="https://arxiv.org/format/2404.19326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LVOS: A Benchmark for Large-scale Long-term Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhongying Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenchao Chen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenzhi Tan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yuang Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shuyong Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19326v2-abstract-short" style="display: inline;"> Video object segmentation (VOS) aims to distinguish and track target objects in a video. Despite the excellent performance achieved by off-the-shelf VOS models, existing VOS benchmarks mainly focus on short-term videos lasting about 5 seconds, where objects remain visible most of the time. However, these benchmarks poorly represent practical applications, and the absence of long-term datasets rest… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19326v2-abstract-full').style.display = 'inline'; document.getElementById('2404.19326v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19326v2-abstract-full" style="display: none;"> Video object segmentation (VOS) aims to distinguish and track target objects in a video. Despite the excellent performance achieved by off-the-shelf VOS models, existing VOS benchmarks mainly focus on short-term videos lasting about 5 seconds, where objects remain visible most of the time. However, these benchmarks poorly represent practical applications, and the absence of long-term datasets restricts further investigation of VOS in realistic scenarios. Thus, we propose a novel benchmark named LVOS, comprising 720 videos with 296,401 frames and 407,945 high-quality annotations. Videos in LVOS last 1.14 minutes on average, approximately 5 times longer than videos in existing datasets. Each video includes various attributes, especially challenges arising in the wild, such as long-term reappearing and cross-temporal similar objects. Compared to previous benchmarks, our LVOS better reflects VOS models' performance in real scenarios. Based on LVOS, we evaluate 20 existing VOS models under 4 different settings and conduct a comprehensive analysis.
On LVOS, these models suffer a large performance drop, highlighting the challenge of achieving precise tracking and segmentation in real-world scenarios. Attribute-based analysis indicates that the key factor behind the accuracy decline is the increased video length, emphasizing LVOS's crucial role. We hope our LVOS can advance the development of VOS in real scenes. Data and code are available at https://lingyihongfd.github.io/lvos.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19326v2-abstract-full').style.display = 'none'; document.getElementById('2404.19326v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">LVOS V2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14850">arXiv:2404.14850</a> <span> [<a href="https://arxiv.org/pdf/2404.14850">pdf</a>, <a href="https://arxiv.org/format/2404.14850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Simple, Efficient and Scalable Structure-aware Adapter Boosts Protein Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingchen Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingxin Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+B">Bozitao Zhong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Lirong Zheng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Pan Tan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Huiqun Yu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+G">Guisheng Fan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14850v1-abstract-short" style="display: inline;"> Fine-tuning pre-trained protein language models (PLMs) has emerged as a prominent strategy for enhancing downstream prediction tasks, often outperforming traditional supervised learning approaches. Employing Parameter-Efficient Fine-Tuning techniques, a powerful approach widely applied in natural language processing, could potentially enhance the performance of PLMs.
However, the direct transfe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14850v1-abstract-full').style.display = 'inline'; document.getElementById('2404.14850v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14850v1-abstract-full" style="display: none;"> Fine-tuning pre-trained protein language models (PLMs) has emerged as a prominent strategy for enhancing downstream prediction tasks, often outperforming traditional supervised learning approaches. Employing Parameter-Efficient Fine-Tuning techniques, a powerful approach widely applied in natural language processing, could potentially enhance the performance of PLMs. However, the direct transfer to life science tasks is non-trivial due to the different training strategies and data forms. To address this gap, we introduce SES-Adapter, a simple, efficient, and scalable adapter method for enhancing the representation learning of PLMs. SES-Adapter incorporates PLM embeddings with structural sequence embeddings to create structure-aware representations. We show that the proposed method is compatible with different PLM architectures and across diverse tasks. Extensive evaluations are conducted on 2 types of folding structures with notable quality differences, 9 state-of-the-art baselines, and 9 benchmark datasets across distinct downstream tasks. Results show that, compared to vanilla PLMs, SES-Adapter improves downstream task performance by a maximum of 11% and an average of 3%, while accelerating training speed by a maximum of 1034% and an average of 362%; the convergence rate is also improved by approximately 2 times. Moreover, positive optimization is observed even with low-quality predicted structures. The source code for SES-Adapter is available at https://github.com/tyang816/SES-Adapter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14850v1-abstract-full').style.display = 'none'; document.getElementById('2404.14850v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024.
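</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The abstract's key step is fusing PLM embeddings with structural sequence embeddings into structure-aware representations. The PyTorch module below is a minimal sketch of one such fusion, under assumed dimensions and an assumed cross-attention mechanism; it is not the architecture released at the repository linked above.</span> </p> <pre><code>import torch
import torch.nn as nn

class StructureAwareAdapter(nn.Module):
    """Sketch: residue-level PLM states attend to embeddings of a
    discretized structural sequence; a residual keeps the PLM signal.
    Sizes and the cross-attention fusion are assumptions."""

    def __init__(self, d_plm: int = 1280, d_struct: int = 128, n_heads: int = 8):
        super().__init__()
        self.struct_proj = nn.Linear(d_struct, d_plm)  # align dimensions
        self.cross_attn = nn.MultiheadAttention(d_plm, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_plm)

    def forward(self, plm_emb: torch.Tensor, struct_emb: torch.Tensor) -> torch.Tensor:
        # plm_emb: (B, L, d_plm) frozen PLM states; struct_emb: (B, L, d_struct)
        s = self.struct_proj(struct_emb)
        fused, _ = self.cross_attn(query=plm_emb, key=s, value=s)
        return self.norm(plm_emb + fused)
</code></pre> <p class="is-size-7">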
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 4 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10595">arXiv:2404.10595</a> <span> [<a href="https://arxiv.org/pdf/2404.10595">pdf</a>, <a href="https://arxiv.org/format/2404.10595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automated Evaluation of Large Vision-Language Models on Self-driving Corner Cases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanze Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenhua Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanxin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengxiang Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+R">Ruiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+M">Meng Tian</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinhai Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10595v5-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have received widespread attention for advancing the interpretable self-driving. Existing evaluations of LVLMs primarily focus on multi-faceted capabilities in natural circumstances, lacking automated and quantifiable assessment for self-driving, let alone the severe road corner cases. In this work, we propose CODA-LM, the very first benchmark for the automatic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10595v5-abstract-full').style.display = 'inline'; document.getElementById('2404.10595v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10595v5-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have received widespread attention for advancing the interpretable self-driving. Existing evaluations of LVLMs primarily focus on multi-faceted capabilities in natural circumstances, lacking automated and quantifiable assessment for self-driving, let alone the severe road corner cases. In this work, we propose CODA-LM, the very first benchmark for the automatic evaluation of LVLMs for self-driving corner cases. We adopt a hierarchical data structure and prompt powerful LVLMs to analyze complex driving scenes and generate high-quality pre-annotations for the human annotators, while for LVLM evaluation, we show that using the text-only large language models (LLMs) as judges reveals even better alignment with human preferences than the LVLM judges. 
Moreover, with our CODA-LM, we build CODA-VLM, a new driving LVLM surpassing all open-sourced counterparts on CODA-LM. Our CODA-VLM performs comparably with GPT-4V, even surpassing GPT-4V by +21.42% on the regional perception task. We hope CODA-LM can become the catalyst to promote interpretable self-driving empowered by LVLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10595v5-abstract-full').style.display = 'none'; document.getElementById('2404.10595v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by WACV 2025. Project Page: https://coda-dataset.github.io/coda-lm/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00245">arXiv:2404.00245</a> <span> [<a href="https://arxiv.org/pdf/2404.00245">pdf</a>, <a href="https://arxiv.org/format/2404.00245">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Aligning Large Language Models with Recommendation Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuwei Cao</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+N">Nikhil Mehta</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xinyang Yi</a>, <a href="/search/cs?searchtype=author&query=Keshavan%2C+R">Raghunandan Keshavan</a>, <a href="/search/cs?searchtype=author&query=Heldt%2C+L">Lukasz Heldt</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lichan Hong</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+E+H">Ed H. Chi</a>, <a href="/search/cs?searchtype=author&query=Sathiamoorthy%2C+M">Maheswaran Sathiamoorthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00245v1-abstract-short" style="display: inline;"> Large language models (LLMs) have recently been used as backbones for recommender systems. However, their performance often lags behind conventional methods in standard tasks like retrieval. We attribute this to a mismatch between LLMs' knowledge and the knowledge crucial for effective recommendations. While LLMs excel at natural language reasoning, they cannot model complex user-item interactions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00245v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00245v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00245v1-abstract-full" style="display: none;"> Large language models (LLMs) have recently been used as backbones for recommender systems. However, their performance often lags behind conventional methods in standard tasks like retrieval. 
We attribute this to a mismatch between LLMs' knowledge and the knowledge crucial for effective recommendations. While LLMs excel at natural language reasoning, they cannot model complex user-item interactions inherent in recommendation tasks. We propose bridging the knowledge gap and equipping LLMs with recommendation-specific knowledge to address this. Operations such as Masked Item Modeling (MIM) and Bayesian Personalized Ranking (BPR) have found success in conventional recommender systems. Inspired by this, we simulate these operations through natural language to generate auxiliary-task data samples that encode item correlations and user preferences. Fine-tuning LLMs on such auxiliary-task data samples and incorporating more informative recommendation-task data samples facilitate the injection of recommendation-specific knowledge into LLMs. Extensive experiments across retrieval, ranking, and rating prediction tasks on LLMs such as FLAN-T5-Base and FLAN-T5-XL show the effectiveness of our technique in domains such as Amazon Toys & Games, Beauty, and Sports & Outdoors. Notably, our method outperforms conventional and LLM-based baselines, including the current SOTA, by significant margins in retrieval, showcasing its potential for enhancing recommendation quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00245v1-abstract-full').style.display = 'none'; document.getElementById('2404.00245v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the NAACL 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17146">arXiv:2403.17146</a> <span> [<a href="https://arxiv.org/pdf/2403.17146">pdf</a>, <a href="https://arxiv.org/format/2403.17146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Outcome-Constrained Large Language Models for Countering Hate Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingzi Hong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Pengcheng Luo</a>, <a href="/search/cs?searchtype=author&query=Blanco%2C+E">Eduardo Blanco</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoying Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17146v2-abstract-short" style="display: inline;"> Automatic counterspeech generation methods have been developed to assist efforts in combating hate speech. Existing research focuses on generating counterspeech with linguistic attributes such as being polite, informative, and intent-driven. However, the real impact of counterspeech in online environments is seldom considered.
This study aims to develop methods for generating counterspeech constra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17146v2-abstract-full').style.display = 'inline'; document.getElementById('2403.17146v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17146v2-abstract-full" style="display: none;"> Automatic counterspeech generation methods have been developed to assist efforts in combating hate speech. Existing research focuses on generating counterspeech with linguistic attributes such as being polite, informative, and intent-driven. However, the real impact of counterspeech in online environments is seldom considered. This study aims to develop methods for generating counterspeech constrained by conversation outcomes and evaluate their effectiveness. We experiment with large language models (LLMs) to incorporate into the text generation process two desired conversation outcomes: low conversation incivility and non-hateful hater reentry. Specifically, we experiment with instruction prompts, LLM finetuning, and LLM reinforcement learning (RL). Evaluation results show that our methods effectively steer the generation of counterspeech toward the desired outcomes. Our analyses, however, show that there are differences in the quality and style depending on the model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17146v2-abstract-full').style.display = 'none'; document.getElementById('2403.17146v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at the EMNLP 2024 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16885">arXiv:2403.16885</a> <span> [<a href="https://arxiv.org/pdf/2403.16885">pdf</a>, <a href="https://arxiv.org/format/2403.16885">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CVT-xRF: Contrastive In-Voxel Transformer for 3D Consistent Radiance Fields from Sparse Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yingji Zhong</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16885v1-abstract-short" style="display: inline;"> Neural Radiance Fields (NeRF) have shown impressive capabilities for photorealistic novel view synthesis when trained on dense inputs. 
However, when trained on sparse inputs, NeRF typically encounters issues of incorrect density or color predictions, mainly due to insufficient coverage of the scene causing partial and sparse supervision, thus leading to significant performance degradation. While e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16885v1-abstract-full').style.display = 'inline'; document.getElementById('2403.16885v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16885v1-abstract-full" style="display: none;"> Neural Radiance Fields (NeRF) have shown impressive capabilities for photorealistic novel view synthesis when trained on dense inputs. However, when trained on sparse inputs, NeRF typically encounters issues of incorrect density or color predictions, mainly due to insufficient coverage of the scene causing partial and sparse supervision, thus leading to significant performance degradation. While existing works mainly consider ray-level consistency to construct 2D learning regularization based on rendered color, depth, or semantics on image planes, in this paper we propose a novel approach that models 3D spatial field consistency to improve NeRF's performance with sparse inputs. Specifically, we first adopt a voxel-based ray sampling strategy to ensure that the sampled rays intersect with a certain voxel in 3D space. We then randomly sample additional points within the voxel and apply a Transformer to infer the properties of other points on each ray, which are then incorporated into the volume rendering. By backpropagating through the rendering loss, we enhance the consistency among neighboring points. Additionally, we propose to use a contrastive loss on the encoder output of the Transformer to further improve consistency within each voxel. Experiments demonstrate that our method yields significant improvement over different radiance fields in the sparse inputs setting, and achieves comparable performance with current works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16885v1-abstract-full').style.display = 'none'; document.getElementById('2403.16885v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is accepted by CVPR 2024. 
Project page is available at https://zhongyingji.github.io/CVT-xRF</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14093">arXiv:2403.14093</a> <span> [<a href="https://arxiv.org/pdf/2403.14093">pdf</a>, <a href="https://arxiv.org/format/2403.14093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Science based AI model certification for untrained operational environments with application in traffic state estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mupupuni%2C+D">Daryl Mupupuni</a>, <a href="/search/cs?searchtype=author&query=Guntu%2C+A">Anupama Guntu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+K">Kamrul Hasan</a>, <a href="/search/cs?searchtype=author&query=Keel%2C+L">Leehyun Keel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14093v1-abstract-short" style="display: inline;"> The expanding role of Artificial Intelligence (AI) in diverse engineering domains highlights the challenges associated with deploying AI models in new operational environments, involving substantial investments in data collection and model training. Rapid application of AI necessitates evaluating the feasibility of utilizing pre-trained models in unobserved operational settings with minimal or no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14093v1-abstract-full').style.display = 'inline'; document.getElementById('2403.14093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14093v1-abstract-full" style="display: none;"> The expanding role of Artificial Intelligence (AI) in diverse engineering domains highlights the challenges associated with deploying AI models in new operational environments, involving substantial investments in data collection and model training. Rapid application of AI necessitates evaluating the feasibility of utilizing pre-trained models in unobserved operational settings with minimal or no additional data. However, interpreting the opaque nature of AI's black-box models remains a persistent challenge. Addressing this issue, this paper proposes a science-based certification methodology to assess the viability of employing pre-trained data-driven models in untrained operational environments. The methodology advocates a profound integration of domain knowledge, leveraging theoretical and analytical models from physics and related disciplines, with data-driven AI models. This novel approach introduces tools to facilitate the development of secure engineering systems, providing decision-makers with confidence in the trustworthiness and safety of AI-based models across diverse environments characterized by limited training data and dynamic, uncertain conditions. The paper demonstrates the efficacy of this methodology in real-world safety-critical scenarios, particularly in the context of traffic state estimation. 
Through simulation results, the study illustrates how the proposed methodology efficiently quantifies physical inconsistencies exhibited by pre-trained AI models. By utilizing analytical models, the methodology offers a means to gauge the applicability of pre-trained AI models in new operational environments. This research contributes to advancing the understanding and deployment of AI models, offering a robust certification framework that enhances confidence in their reliability and safety across a spectrum of operational conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14093v1-abstract-full').style.display = 'none'; document.getElementById('2403.14093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13304">arXiv:2403.13304</a> <span> [<a href="https://arxiv.org/pdf/2403.13304">pdf</a>, <a href="https://arxiv.org/format/2403.13304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DetDiffusion: Synergizing Generative and Perceptive Models for Enhanced Data Generation and Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yibo Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+R">Ruiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kaiqiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lihui Jiang</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13304v1-abstract-short" style="display: inline;"> Current perceptive models heavily depend on resource-intensive datasets, prompting the need for innovative solutions. Leveraging recent advances in diffusion models, synthetic data, by constructing image inputs from various annotations, proves beneficial for downstream tasks. 
While prior methods have separately addressed generative and perceptive models, DetDiffusion, for the first time, harmonize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13304v1-abstract-full').style.display = 'inline'; document.getElementById('2403.13304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13304v1-abstract-full" style="display: none;"> Current perceptive models heavily depend on resource-intensive datasets, prompting the need for innovative solutions. Leveraging recent advances in diffusion models, synthetic data, by constructing image inputs from various annotations, proves beneficial for downstream tasks. While prior methods have separately addressed generative and perceptive models, DetDiffusion, for the first time, harmonizes both, tackling the challenges in generating effective data for perceptive models. To enhance image generation with perceptive models, we introduce perception-aware loss (P.A. loss) through segmentation, improving both quality and controllability. To boost the performance of specific perceptive models, our method customizes data augmentation by extracting and utilizing perception-aware attribute (P.A. Attr) during generation. Experimental results from the object detection task highlight DetDiffusion's superior performance, establishing a new state-of-the-art in layout-guided generation. Furthermore, image syntheses from DetDiffusion can effectively augment training data, significantly enhancing downstream detection performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13304v1-abstract-full').style.display = 'none'; document.getElementById('2403.13304v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
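</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The perception-aware loss (P.A. loss) described above pairs the usual diffusion denoising objective with segmentation supervision. The function below is a minimal sketch of that pairing; the cross-entropy form and the weighting factor lam are assumptions, not the paper's exact formulation.</span> </p> <pre><code>import torch.nn.functional as F

def perception_aware_loss(noise_pred, noise, seg_logits, seg_labels, lam=0.1):
    """Sketch: standard epsilon-prediction loss plus a segmentation term
    from a perception head; `lam` balances the two objectives."""
    l_denoise = F.mse_loss(noise_pred, noise)           # diffusion denoising loss
    l_percep = F.cross_entropy(seg_logits, seg_labels)  # perception-aware term
    return l_denoise + lam * l_percep
</code></pre> <p class="is-size-7">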
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09836">arXiv:2403.09836</a> <span> [<a href="https://arxiv.org/pdf/2403.09836">pdf</a>, <a href="https://arxiv.org/format/2403.09836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Empowering Healthcare through Privacy-Preserving MRI Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Amin%2C+A">Al Amin</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+K">Kamrul Hasan</a>, <a href="/search/cs?searchtype=author&query=Zein-Sabatto%2C+S">Saleh Zein-Sabatto</a>, <a href="/search/cs?searchtype=author&query=Chimba%2C+D">Deo Chimba</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+I">Imtiaz Ahmed</a>, <a href="/search/cs?searchtype=author&query=Islam%2C+T">Tariqul Islam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09836v1-abstract-short" style="display: inline;"> In the healthcare domain, Magnetic Resonance Imaging (MRI) assumes a pivotal role, as it employs Artificial Intelligence (AI) and Machine Learning (ML) methodologies to extract invaluable insights from imaging data. Nonetheless, the imperative need for patient privacy poses significant challenges when collecting data from diverse healthcare sources. Consequently, the Deep Learning (DL) communities… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09836v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09836v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09836v1-abstract-full" style="display: none;"> In the healthcare domain, Magnetic Resonance Imaging (MRI) assumes a pivotal role, as it employs Artificial Intelligence (AI) and Machine Learning (ML) methodologies to extract invaluable insights from imaging data. Nonetheless, the imperative need for patient privacy poses significant challenges when collecting data from diverse healthcare sources. Consequently, the Deep Learning (DL) communities occasionally face difficulties detecting rare features. In this research endeavor, we introduce the Ensemble-Based Federated Learning (EBFL) Framework, an innovative solution tailored to address this challenge. The EBFL framework deviates from the conventional approach by emphasizing model features over sharing sensitive patient data. This unique methodology fosters a collaborative and privacy-conscious environment for healthcare institutions, empowering them to harness the capabilities of a centralized server for model refinement while upholding the utmost data privacy standards.Conversely, a robust ensemble architecture boasts potent feature extraction capabilities, distinguishing itself from a single DL model. 
This quality makes it highly dependable for MRI analysis. By harnessing our groundbreaking EBFL methodology, we have achieved remarkable precision in the classification of brain tumors, including glioma, meningioma, pituitary, and non-tumor instances, attaining a precision rate of 94% for the Global model and an impressive 96% for the Ensemble model. Our models underwent rigorous evaluation using conventional performance metrics such as Accuracy, Precision, Recall, and F1 Score. Integrating DL within the Federated Learning (FL) framework has yielded a methodology that offers precise and dependable diagnostics for detecting brain tumors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09836v1-abstract-full').style.display = 'none'; document.getElementById('2403.09836v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09634">arXiv:2403.09634</a> <span> [<a href="https://arxiv.org/pdf/2403.09634">pdf</a>, <a href="https://arxiv.org/format/2403.09634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OneTracker: Unifying Visual Object Tracking with Foundation Models and Efficient Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shilin Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wanyun Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiting Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09634v1-abstract-short" style="display: inline;"> Visual object tracking aims to localize the target object in each frame based on its initial appearance in the first frame. Depending on the input modality, tracking tasks can be divided into RGB tracking and RGB+X (e.g., RGB+N and RGB+D) tracking. Despite the different input modalities, the core aspect of tracking is temporal matching.
Based on this common ground, we present a general framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09634v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09634v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09634v1-abstract-full" style="display: none;"> Visual object tracking aims to localize the target object in each frame based on its initial appearance in the first frame. Depending on the input modality, tracking tasks can be divided into RGB tracking and RGB+X (e.g., RGB+N and RGB+D) tracking. Despite the different input modalities, the core aspect of tracking is temporal matching. Based on this common ground, we present a general framework to unify various tracking tasks, termed OneTracker. OneTracker first performs a large-scale pre-training on an RGB tracker called Foundation Tracker. This pretraining phase equips the Foundation Tracker with a stable ability to estimate the location of the target object. Then we regard other modality information as prompt and build Prompt Tracker upon Foundation Tracker. By freezing the Foundation Tracker and only adjusting some additional trainable parameters, Prompt Tracker inherits the strong localization ability of Foundation Tracker and achieves parameter-efficient finetuning on downstream RGB+X tracking tasks. To evaluate the effectiveness of our general framework OneTracker, which consists of Foundation Tracker and Prompt Tracker, we conduct extensive experiments on 6 popular tracking tasks across 11 benchmarks and our OneTracker outperforms other models and achieves state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09634v1-abstract-full').style.display = 'none'; document.getElementById('2403.09634v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
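</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The freeze-and-prompt recipe in this abstract (a frozen Foundation Tracker plus a small set of trainable prompt parameters) reduces to a few lines of PyTorch. The module names below are illustrative, not the authors' code.</span> </p> <pre><code>import torch.nn as nn

def build_prompt_tracker(foundation: nn.Module, prompt_block: nn.Module):
    """Sketch: keep the pretrained RGB tracker fixed and train only the
    small prompt module that injects the extra modality."""
    for p in foundation.parameters():
        p.requires_grad = False  # Foundation Tracker stays frozen
    for p in prompt_block.parameters():
        p.requires_grad = True   # only prompt parameters are tuned
    # Hand just the trainable parameters to the optimizer.
    return [p for p in prompt_block.parameters() if p.requires_grad]
</code></pre> <p class="is-size-7">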
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hong%2C+L&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hong%2C+L&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hong%2C+L&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hong%2C+L&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Hong%2C+L&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status 
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>